In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scirpy as ir
from matplotlib import pyplot as plt, cm as mpl_cm
from cycler import cycler
from tcr_processing import *
from dextramer_norm import dextramer_normalise
import datetime
import csv

# Global scanpy settings: default figure size for plots and logging verbosity.
sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)

In [2]:
class TCR:
    '''Pipeline object for loading, processing and clustering 10X TCR data with Scirpy.

    The object is driven by a single ``parameters`` dict which is stored by
    reference as ``self.params``; keys written by the pipeline ('N' and, when no
    clustering is requested, 'cluster_distance') are therefore visible to the caller.

    Keys read from ``parameters``:
        contigs, bc_matrix                    -- input files for load_data
        preprocess, min_cells, min_genes      -- gene/cell filtering in preprocess
        max_genes, mito_cutoff                -- cell filtering in drop_dead
        binarized_matrix, dextramer_normalise,
        epitopes_loadfile, epitopes_outfile   -- epitope map in get_epitopes
        subj, cluster_distance, tcr_outfile   -- TCR table in get_TCRs

    Intended call order: load_data -> preprocess -> get_epitopes -> get_TCRs.
    '''

    def __init__(self, parameters):
        '''Store the parameter dict (by reference); raises AssertionError if it is not a dict.'''
        print('Initialising class object')

        assert isinstance(parameters, dict)

        self.params = parameters

    def load_data(self):
        '''Load 10X TCR contigs and the transcriptomic count matrix and merge them.

        Sets ``self.adata`` to the merged AnnData object and starts ``self.N``,
        the list of cell counts recorded after each processing stage.

        Raises:
            TypeError: if the contigs path does not end in 'csv' or the matrix
                path does not end in 'h5'.
        '''
        print('Loading TCR binding and transcriptomics data')

        contigs, feature_matrix = [self.params[x] for x in ['contigs', 'bc_matrix']]

        if not contigs.endswith('csv'):
            raise TypeError('Contigs file should be in csv format')

        if not feature_matrix.endswith('h5'):
            raise TypeError('Matrix file should be in h5 format')

        # Load TCR data
        adata_tcr = ir.io.read_10x_vdj(contigs)

        # Load associated transcriptomics data
        adata = sc.read_10x_h5(feature_matrix)

        # Merge the receptor annotation into the transcriptomics object (in place)
        ir.pp.merge_with_ir(adata, adata_tcr)

        print("Data loaded to Ann Object with shape: ", adata.shape)

        self.adata = adata
        print('AnnData object with %s entries at stage 1' % (self.adata.n_obs))
        self.N = [adata.n_obs]

    def drop_dead(self):
        '''Remove cells at or above the mitochondrial expression cutoff and/or gene count cutoff.

        ``mito_cutoff`` is compared against ``obs.pct_counts_mt`` and ``max_genes``
        against ``obs.n_genes_by_counts``; a falsy value disables that filter.
        The cell count after each of the two steps is appended to ``self.N``.
        '''
        max_genes, mito_cutoff = [self.params[x] for x in ['max_genes', 'mito_cutoff']]

        if mito_cutoff or max_genes:
            # Both filters read columns produced by calculate_qc_metrics, so the
            # metrics must be computed whenever either filter is enabled
            # (previously max_genes alone failed on a missing n_genes_by_counts).
            self.adata.var['mt'] = self.adata.var_names.str.startswith('MT-')  # boolean mask for genes starting with 'MT-'
            sc.pp.calculate_qc_metrics(self.adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

        if mito_cutoff:
            print('Eliminating cells above mitochondrial gene expression threshold')
            self.adata = self.adata[self.adata.obs.pct_counts_mt < mito_cutoff, :]
        print('AnnData object with %s entries at stage 4' % (self.adata.n_obs))
        self.N = self.N + [self.adata.n_obs]

        if max_genes:
            print('Eliminating suspected doublets')
            self.adata = self.adata[self.adata.obs.n_genes_by_counts < max_genes, :]
        print('AnnData object with %s entries at stage 5' % (self.adata.n_obs))
        self.N = self.N + [self.adata.n_obs]

    def preprocess(self):
        '''Optionally filter/normalise the expression data, then apply TCR chain QC.

        When ``params['preprocess']`` is True: filter genes (min_cells), filter
        cells (min_genes), call drop_dead, then normalise counts per cell and
        log-transform. Independently of that flag, cells whose ``chain_pairing``
        is 'multichain', 'orphan VDJ', 'orphan VJ' or 'no IR' are dropped.
        The per-stage cell counts are stored in ``self.N`` and ``params['N']``.
        '''
        if self.params['preprocess'] == True:

            min_cells, min_genes = [self.params[x] for x in ['min_cells', 'min_genes']]

            if min_cells:
                sc.pp.filter_genes(self.adata, min_cells=min_cells)
            print('AnnData object with %s entries at stage 2' % (self.adata.n_obs))
            self.N = self.N + [self.adata.n_obs]

            if min_genes:
                sc.pp.filter_cells(self.adata, min_genes=min_genes)
            print('AnnData object with %s entries at stage 3' % (self.adata.n_obs))
            self.N = self.N + [self.adata.n_obs]

            # Filter dead and doublet cells (stages 4 and 5)
            self.drop_dead()

            # Normalise count data
            sc.pp.normalize_per_cell(self.adata, counts_per_cell_after=1000)
            sc.pp.log1p(self.adata)

        # QC of TCR chains
        print('AnnData object with %s entries at stage 6' % (self.adata.n_obs))
        self.N = self.N + [self.adata.n_obs]

        ir.tl.chain_qc(self.adata)

        chain_pairing = self.adata.obs["chain_pairing"]
        print(
            "Fraction of cells with more than one pair of TCRs: {:.2f}".format(
                np.sum(chain_pairing.isin(["extra VJ", "extra VDJ", "two full chains"])) / self.adata.n_obs))
        print('Dropping multichain and single chain receptors')

        # Drop multichain, single-chain and receptor-less cells in one pass
        # (same result as filtering the categories one after the other).
        unwanted = ["multichain", "orphan VDJ", "orphan VJ", "no IR"]
        self.adata = self.adata[~chain_pairing.isin(unwanted), :].copy()

        print('AnnData object with %s entries at stage 7' % (self.adata.n_obs))
        self.N = self.N + [self.adata.n_obs]
        self.params['N'] = self.N

    @staticmethod
    def _binder_column_to_epitope(column_name, suffix='_binder'):
        '''Return ``column_name`` without a trailing ``suffix``.

        str.strip('_binder') must not be used here: it removes any of the
        characters '_', 'b', 'i', 'n', 'd', 'e', 'r' from both ends, so an
        epitope ending in e.g. 'Cancer' would be truncated to 'Canc'.
        '''
        if suffix and column_name.endswith(suffix):
            return column_name[:-len(suffix)]
        return column_name

    @staticmethod
    def _epitopes_from_binary_matrix(binary_matrix, first_binder_col=68):
        '''Build a {barcode: epitope} dict from a binarized matrix DataFrame.

        Columns from position ``first_binder_col`` onwards are treated as binder
        flags; each barcode is mapped to the first flag column equal to True
        (with the '_binder' suffix removed) or to the string 'None'.
        '''
        binder_flags = binary_matrix[binary_matrix.columns[first_binder_col:]]
        column_names = list(binder_flags.columns)
        is_binder = (binder_flags == True).to_numpy()

        epitopes = {}
        for barcode, row in zip(binary_matrix['barcode'].values, is_binder):
            hits = np.flatnonzero(row)
            if hits.size:
                epitopes[barcode] = TCR._binder_column_to_epitope(column_names[hits[0]])
            else:
                epitopes[barcode] = 'None'
        return epitopes

    def get_epitopes(self):
        '''Build the barcode -> epitope map and store it as ``self.epitopes``.

        Source, in order of precedence:
          1. ``dextramer_normalise`` truthy: dextramer_normalise(binarized_matrix, contigs)
          2. ``epitopes_loadfile`` set: load a previously pickled map
          3. ``epitopes_outfile`` set: derive the map from the binarized matrix
             csv and pickle it to ``epitopes_outfile``

        Raises:
            ValueError: if none of the three sources is configured.
        '''
        binarized_matrix, contigs, epitopes_loadfile, epitopes_outfile, dexnorm = [
            self.params[x] for x in ['binarized_matrix',
                                     'contigs',
                                     'epitopes_loadfile',
                                     'epitopes_outfile',
                                     'dextramer_normalise']]

        if dexnorm:
            epitopes = dextramer_normalise(binarized_matrix, contigs)

        elif epitopes_loadfile:
            epitopes = load_pickle(epitopes_loadfile)

        elif epitopes_outfile:
            print('Loading binary matrix')
            binary_matrix = pd.read_csv(binarized_matrix)
            print(binary_matrix.columns)
            print(binary_matrix.head())
            print('Reading epitope specificity from %s cells' % (len(binary_matrix)))
            # Binder columns are assumed to start at position 68 - check this for other donors
            epitopes = self._epitopes_from_binary_matrix(binary_matrix)
            save_pickle(epitopes, epitopes_outfile)

        else:
            raise ValueError("You must specify either a pre-existing epitope file, or an epitopes outfile location")

        self.epitopes = epitopes

    def get_clusters(self):
        '''Generate baseline clusters with scirpy and add them to the TCR table.

        Adds a 'scirpy_<metric>' column to ``adata.uns['TCRs']`` holding the
        cluster key of each barcode, or the string 'None' for barcodes that are
        not listed in the clonotype-cluster cell indices.

        Raises:
            ValueError: if ``params['cluster_distance']`` is not one of
                'identity', 'hamming', 'levenshtein'.
        '''
        dist = self.params['cluster_distance']
        if dist not in ['identity', 'hamming', 'levenshtein']:
            raise ValueError('Enter a distance metric from "identity", "hamming", "levenshtein"')

        ir.pp.ir_dist(
            self.adata,
            metric=dist,
            cutoff=1,
            sequence="aa",
        )
        ir.tl.define_clonotype_clusters(self.adata, sequence='aa', metric=dist, receptor_arms='all', dual_ir='primary_only', same_v_gene=False, n_jobs=4)
        ir.tl.clonotype_network(self.adata, sequence='aa', metric=dist, min_cells=2)

        # Invert {cluster key: barcodes} into {barcode: cluster key}
        cell_indices = self.adata.uns['cc_aa_%s' % (dist)]['cell_indices']
        idx = {barcode: cluster for cluster, barcodes in cell_indices.items() for barcode in barcodes}

        tcr = self.adata.uns['TCRs']
        tcr['scirpy_%s' % (dist)] = [idx.get(barcode, 'None') for barcode in tcr.index]
        self.adata.uns['TCRs'] = tcr

    def get_TCRs(self):
        '''Extract the primary alpha/beta chain per cell and map it to epitope specificity.

        Stores the table in ``adata.uns['TCRs']`` with an 'Epitope' column
        ('None' for barcodes missing from ``self.epitopes``) and a
        'subject:condition' column. Runs get_clusters when
        ``params['cluster_distance']`` is set, otherwise records it as the
        string 'None'. Pickles the table to ``params['tcr_outfile']`` if given.
        '''
        tcr_table = self.adata.obs[['IR_VDJ_1_junction_aa', 'IR_VDJ_1_v_call', 'IR_VDJ_1_j_call',
                                    'IR_VDJ_1_consensus_count', 'IR_VDJ_1_productive', 'IR_VDJ_1_duplicate_count',
                                    'IR_VJ_1_junction_aa', 'IR_VJ_1_v_call', 'IR_VJ_1_j_call',
                                    'IR_VJ_1_consensus_count', 'IR_VJ_1_productive', 'IR_VJ_1_duplicate_count']].copy()

        tcr_table.columns = ['cdr3.beta', 'v.beta', 'j.beta', 'count_beta', 'productive_beta', 'duplicates_beta',
                             'cdr3.alpha', 'v.alpha', 'j.alpha', 'count_alpha', 'productive_alpha', 'duplicates_alpha']

        epitope_dict = self.epitopes
        tcr_table['Epitope'] = [epitope_dict.get(barcode, 'None') for barcode in tcr_table.index]
        tcr_table['subject:condition'] = self.params['subj']

        self.adata.uns['TCRs'] = tcr_table

        if self.params['cluster_distance']:
            self.get_clusters()
            # Re-read so the saved table is the one carrying the cluster column
            tcr_table = self.adata.uns['TCRs']
        else:
            self.params['cluster_distance'] = 'None'

        if self.params['tcr_outfile']:
            print('Saving')
            tcr_table.to_pickle(self.params['tcr_outfile'])

def write_record(record,parameters):
    '''Append the values of ``parameters`` as one row of the csv file ``record``.

    Only the values are written (in dict order); no header row is added here.
    The file is opened with newline='' as required by the csv module, so the
    writer's own line terminator is not translated again (which produces blank
    rows on Windows).
    '''
    print('Recording parameters to: ',record)
    with open(record,'a',newline='') as f:
        writer=csv.writer(f)
        writer.writerow(list(parameters.values()))


In [3]:
def run(parameter_dict,record):
    '''Run the full TCR pipeline for one parameter set and log the parameters.

    Executes load_data, preprocess, get_epitopes and get_TCRs in that order,
    writes the AnnData object to ``parameter_dict['h5_outfile']`` when that
    entry is truthy, then appends the pipeline's parameters to ``record``.
    '''
    pipeline = TCR(parameter_dict)
    stages = (
        pipeline.load_data,
        pipeline.preprocess,
        pipeline.get_epitopes,
        pipeline.get_TCRs,
    )
    for stage in stages:
        stage()

    h5_path = parameter_dict['h5_outfile']
    if h5_path:
        pipeline.adata.write_h5ad(h5_path)

    write_record(record, pipeline.params)
    print('Complete\n')
    

In [4]:
# Execute for single donor: build the parameter set for donor 2 only.
# The timestamp tags every output file name produced with these parameters.

time = datetime.datetime.now().strftime('%Y%m%d_%H%M')

for d in range(2, 3):

    donor = str(d)
    record = 'data/record_merged.csv'
    parameter_dict = {
        # input / output files
        'tcr_outfile': "%s_TCRs_donor_%s.pkl" % (time, donor),
        'contigs': "data/donor%s/vdj_v1_hs_aggregated_donor%s_all_contig_annotations.csv" % (donor, donor),
        'bc_matrix': "data/donor%s/vdj_v1_hs_aggregated_donor%s_filtered_feature_bc_matrix.h5" % (donor, donor),
        'binarized_matrix': 'data/donor%s/vdj_v1_hs_aggregated_donor%s_binarized_matrix.csv' % (donor, donor),
        # processing switches and thresholds
        'preprocess': False,
        'dextramer_normalise': True,
        'min_cells': 10,
        'min_genes': 200,
        'max_genes': False,
        'mito_cutoff': 30,
        # epitope map and AnnData output
        'epitopes_loadfile': None,
        'epitopes_outfile': "%s_epitopes_donor%s.pkl" % (time, donor),
        'h5_outfile': "%s_TCRs_donor_%s.h5ad" % (time, donor),
        # run metadata
        'subj': 'donor%s:healthy' % (donor),
        'organism': 'homo sapiens',
        'cluster_distance': None,
    }

# run(parameter_dict,record)


In [5]:
# Step through the pipeline manually with the current parameter_dict,
# stopping after the epitope map is built (get_TCRs is not called here).
tcr_obj=TCR(parameter_dict)
tcr_obj.load_data()
tcr_obj.preprocess()
tcr_obj.get_epitopes()

Initialising class object
Loading TCR binding and transcriptomics data
reading data/donor2/vdj_v1_hs_aggregated_donor2_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


 (0:00:05)


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'feature_types' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'genome' as categorical


Data loaded to Ann Object with shape:  (91921, 33538)
AnnData object with 91921 entries at stage 1
AnnData object with 91921 entries at stage 6
Fraction of cells with more than one pair of TCRs: 0.07
Dropping multichain and single chain receptors


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


AnnData object with 61236 entries at stage 7
Reading in dextramer binding data
Correcting for negative dextramer binding
Normalising binding signals per cell
Reading in clonotype data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['clonotype']=[clondict[barcode] for barcode in dataframe['barcode'].unique()]


Normalising binding signals across clonotypes
Combining batch and clonotype normalisation
{'AAACCTGAGAAACCTA-30': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGAACTCGG-39': 'A0301_KLGGALQAK_IE-1_CMV', 'AAACCTGAGAAGGGTA-9': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGACACTAA-40': 'A0201_LLDFVRFMGV_EBNA-3B_EBV', 'AAACCTGAGACGCTTT-11': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGACTGGGT-33': 'A0201_GILGFVFTL_Flu-MP_Influenza', 'AAACCTGAGACTTGAA-13': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGAGACTAT-30': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGAGCCTAG-28': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGAGGTAGA-6': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGAGGTTAT-18': 'A0201_KLQCVDLHV_PSA146-154', 'AAACCTGAGAGTACCG-14': 'A1101_IVTDFSVIK_EBNA-3B_EBV', 'AAACCTGAGAGTGACC-30': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGATAGCAT-5': 'B0801_RAKFKQLL_BZLF1_EBV', 'AAACCTGAGATGAGAG-29': 'A0201_GILGFVFTL_Flu-MP_Influenza', 'AAACCTGAGCAATATG-11': 'A0201_GILGFVFTL_Flu-MP_Influenza', 'AAACCTGAGCAATCTC-27': 'A0301_RIAAWMATY_BCL-2L1_Cancer', 'AAACC

In [6]:
# Display the epitope map set by get_epitopes (the full dict is shown; the output is large).
tcr_obj.epitopes

{'AAACCTGAGAAACCTA-30': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGAACTCGG-39': 'A0301_KLGGALQAK_IE-1_CMV',
 'AAACCTGAGAAGGGTA-9': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGACACTAA-40': 'A0201_LLDFVRFMGV_EBNA-3B_EBV',
 'AAACCTGAGACGCTTT-11': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGACTGGGT-33': 'A0201_GILGFVFTL_Flu-MP_Influenza',
 'AAACCTGAGACTTGAA-13': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGAGACTAT-30': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGAGCCTAG-28': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGAGGTAGA-6': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGAGGTTAT-18': 'A0201_KLQCVDLHV_PSA146-154',
 'AAACCTGAGAGTACCG-14': 'A1101_IVTDFSVIK_EBNA-3B_EBV',
 'AAACCTGAGAGTGACC-30': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGATAGCAT-5': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGATGAGAG-29': 'A0201_GILGFVFTL_Flu-MP_Influenza',
 'AAACCTGAGCAATATG-11': 'A0201_GILGFVFTL_Flu-MP_Influenza',
 'AAACCTGAGCAATCTC-27': 'A0301_RIAAWMATY_BCL-2L1_Cancer',
 'AAACCTGAGCAGGCTA-26': 'B0801_RAKFKQLL_BZLF1_EBV',
 'AAACCTGAGCATCATC-34': 'A03

In [None]:
# Batch run: process donors 1-4 with a shared timestamp tag and log each run.
time = datetime.datetime.now().strftime('%Y%m%d_%H%M')

for d in range(1, 5):
    print('*** DONOR %s ***' % (str(d)))
    donor = str(d)
    record = 'data/record_merged.csv'
    parameter_dict = {
        # input / output files
        'tcr_outfile': "%s_TCRs_donor_%s.pkl" % (time, donor),
        'contigs': "data/donor%s/vdj_v1_hs_aggregated_donor%s_all_contig_annotations.csv" % (donor, donor),
        'bc_matrix': "data/donor%s/vdj_v1_hs_aggregated_donor%s_filtered_feature_bc_matrix.h5" % (donor, donor),
        'binarized_matrix': 'data/donor%s/vdj_v1_hs_aggregated_donor%s_binarized_matrix.csv' % (donor, donor),
        # processing switches and thresholds
        'preprocess': False,
        'min_cells': 10,
        'min_genes': 200,
        'max_genes': False,
        'mito_cutoff': 30,
        'dextramer_normalise': True,
        # epitope map and AnnData output
        'epitopes_loadfile': None,
        'epitopes_outfile': "%s_epitopes_donor%s.pkl" % (time, donor),
        'h5_outfile': None,
        # 'h5_outfile': "%s_TCRs_donor_%s.h5ad" % (time, donor),
        # run metadata
        'subj': 'donor%s:healthy' % (donor),
        'organism': 'homo sapiens',
        'cluster_distance': None,
    }

    run(parameter_dict, record)

In [None]:
def merge_donors(root,datetime_tag,parameter_dict,record,donors=range(1,5)):
    '''Concatenate the per-donor TCR pickles of one run and log the merge.

    Args:
        root: prefix prepended to each per-donor pickle name
            ('<root><datetime_tag>_TCRs_donor_<n>.pkl').
        datetime_tag: timestamp tag of the run whose outputs are merged.
        parameter_dict: parameters of the run; used as the template for the
            record row. It is copied, so the caller's dict is left unchanged.
        record: csv file the record row is appended to.
        donors: iterable of donor numbers to merge (default 1-4).

    The merged table is written to '<datetime_tag>_TCRs_merged.csv' in the
    current working directory.
    '''
    # Load every donor table first and concatenate once instead of growing
    # the frame inside the loop.
    frames=[load_pickle(root+'%s_TCRs_donor_%s.pkl'%(datetime_tag,str(donor))) for donor in donors]
    merged=pd.concat(frames) if frames else pd.DataFrame()

    print('Saving merged files')
    savefile='%s_TCRs_merged.csv'%(datetime_tag)
    merged.to_csv(savefile)

    print('Exporting record')
    # Work on a copy: the per-donor entries are replaced for the record row only.
    record_params=dict(parameter_dict)
    record_params['tcr_outfile']=savefile
    for key in ['contigs','bc_matrix','binarized_matrix','epitopes_outfile','h5_outfile','subj']:
        record_params[key]='Multiple'
    if not record_params['cluster_distance']:
        record_params['cluster_distance']='None'
    write_record(record,record_params)

# Prefix under which the per-donor TCR pickles are looked up by merge_donors.
# NOTE(review): absolute path on the author's machine - adjust before running elsewhere.
root='/Users/danhudson/Documents/Academic/Oxford/Oxford_DPhil/Rotation_Projects/Hashem/10X_pipeline/'


# Merge the per-donor outputs for each listed run tag (tags are hard-coded timestamps of earlier runs).
# datetime_tag='20220202_1409'
for tag in ['20220308_1006']:
    merge_donors(root,tag,parameter_dict,record)

In [None]:
# Read the merged csv for run tag 20220308_1006 and list the distinct epitope labels it contains.
df=pd.read_csv('20220308_1006_TCRs_merged.csv')
df['Epitope'].unique()

In [None]:
# Remove rows containing NaNs and exact duplicate rows from the merged TCR table,
# then pickle the filtered table next to the input.
# NOTE(review): this loads '<root><time>_TCRs_merged.pkl', whereas merge_donors in this
# notebook writes '<tag>_TCRs_merged.csv' - confirm the pickle exists for the current `time`.

loadfile = root + time + '_TCRs_merged.pkl'
merged = load_pickle(loadfile)
filt = merged.dropna().drop_duplicates()

n_before = len(merged)
n_removed = n_before - len(filt)
print('Dropping %s files of %s' % (n_removed, n_before))

print('Saving')
filtered_outfile = root + time + '_TCRs_merged_filtered.pkl'
filt.to_pickle(filtered_outfile)
print('Complete')

In [None]:
# Per-donor report of how many rows the duplicate / NaN filters would remove
input_data = merged.copy()
for donor in range(1, 5):
    is_donor = input_data['subject:condition'] == 'donor%s:healthy' % (str(donor))
    df = input_data[is_donor]
    dups = len(df) - len(df.drop_duplicates())
    nans = len(df) - len(df.dropna())
    print('Removing %s duplicates and %s NaNs for donor %s' % (dups, nans, donor))

# Apply the filters to the whole table, then draw one figure per V/J gene column
# with the four donors overlaid as separate histograms
input_data = input_data.dropna().drop_duplicates()
for variable in ['v.beta', 'j.beta', 'v.alpha', 'j.alpha']:
    plt.figure(figsize=(12, 3))
    for donor in range(1, 5):
        is_donor = input_data['subject:condition'] == 'donor%s:healthy' % (str(donor))
        df = input_data[is_donor]
        gene_calls = df[variable].values.astype(str)
        plt.hist(gene_calls, label='donor %s' % (donor))

    plt.grid(False)
    plt.xticks(rotation=90)
    plt.title(variable.upper())

    plt.legend()
    plt.show()
    

In [None]:
# Column names for the rows appended to the record file by write_record.
# The order follows the parameter dict of the all-donor batch cell; 'N' is added to
# that dict by TCR.preprocess, so every record row has 17 values. The previous header
# listed only 15 names ('dextramer_normalise' and 'N' were missing), which shifted
# every column after 'mito_cutoff'.
header=['tcr_outfile','contigs','bc_matrix','binarized_matrix','preprocess','min_cells','min_genes','max_genes','mito_cutoff','dextramer_normalise','epitopes_loadfile','epitopes_outfile','h5_outfile','subj','organism','cluster_distance','N']
# newline='' is required by the csv module so the row terminator is not translated again
with open('data/record_merged.csv','a',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)