In [2]:
import pandas as pd
from scipy import stats

In [20]:
# Read the RNA UMI count matrix from CSV.
# The first CSV column is used as the row index, so rows are gene IDs and
# columns are cell barcodes.
counts_file = '../data/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv'
counts = pd.read_csv(
    filepath_or_buffer=counts_file,
    sep=',',
    index_col=0,
)


In [25]:
# Read the CLR-transformed ADT (cell surface marker) matrix that serves as
# the per-cell labels.
# The first CSV column is used as the row index, so rows are cell surface
# markers and columns are cell barcodes.
labels_file = '../data/GSE100866_CBMC_8K_13AB_10X-ADT_clr-transformed.csv'
labels = pd.read_csv(
    filepath_or_buffer=labels_file,
    sep=',',
    index_col=0,
)

In [28]:
# Collect the two sets of names that will be matched against each other:
# marker names are the row labels of the ADT matrix, and candidate gene IDs
# are the row labels of the RNA counts matrix.
marker_names = labels.index
gene_ids = counts.index
# Show the marker names that need a gene mapping.
marker_names

Index(['CD3', 'CD4', 'CD8', 'CD45RA', 'CD56', 'CD16', 'CD10', 'CD11c', 'CD14',
       'CD19', 'CD34', 'CCR5', 'CCR7'],
      dtype='object')

In [41]:
# Exact-name pass: a marker maps to a gene only when the gene ID is literally
# 'HUMAN_' followed by the marker name. Markers with no such gene ID keep an
# empty list so they can be resolved separately.
marker_to_genes = {
    marker: [gene_id for gene_id in gene_ids if gene_id == 'HUMAN_' + marker]
    for marker in marker_names
}

In [42]:
# Display the exact-match mapping; a marker with an empty list had no gene ID
# equal to 'HUMAN_' + marker.
marker_to_genes

{'CCR5': ['HUMAN_CCR5'],
 'CCR7': ['HUMAN_CCR7'],
 'CD10': [],
 'CD11c': [],
 'CD14': ['HUMAN_CD14'],
 'CD16': [],
 'CD19': ['HUMAN_CD19'],
 'CD3': [],
 'CD34': ['HUMAN_CD34'],
 'CD4': ['HUMAN_CD4'],
 'CD45RA': [],
 'CD56': [],
 'CD8': []}

In [43]:
# Substring pass: only markers whose exact-match list is empty are searched,
# collecting every gene ID that contains the marker name anywhere in it.
partial_matches = {
    marker: [gene_id for gene_id in gene_ids if marker in gene_id]
    for marker, exact_hits in marker_to_genes.items()
    if not exact_hits
}

In [44]:
# Display the substring candidates found for markers without an exact match.
partial_matches

{'CD10': ['HUMAN_CD101', 'HUMAN_CD109', 'HUMAN_PDCD10'],
 'CD11c': [],
 'CD16': ['HUMAN_CD160', 'HUMAN_CD163', 'HUMAN_CD163L1', 'HUMAN_CD164'],
 'CD3': ['HUMAN_ABCD3',
  'HUMAN_C2CD3',
  'HUMAN_CD300A',
  'HUMAN_CD300C',
  'HUMAN_CD300E',
  'HUMAN_CD300LB',
  'HUMAN_CD300LD',
  'HUMAN_CD300LF',
  'HUMAN_CD302',
  'HUMAN_CD320',
  'HUMAN_CD33',
  'HUMAN_CD34',
  'HUMAN_CD36',
  'HUMAN_CD37',
  'HUMAN_CD38',
  'HUMAN_CD3D',
  'HUMAN_CD3E',
  'HUMAN_CD3EAP',
  'HUMAN_CD3G',
  'HUMAN_NUDCD3',
  'HUMAN_PLCD3',
  'HUMAN_PTCD3',
  'HUMAN_SMARCD3'],
 'CD45RA': [],
 'CD56': [],
 'CD8': ['HUMAN_CD81',
  'HUMAN_CD82',
  'HUMAN_CD83',
  'HUMAN_CD84',
  'HUMAN_CD86',
  'HUMAN_CD8A',
  'HUMAN_CD8B']}

In [79]:
# Hand-curated marker -> gene ID mappings for markers that had no exact match.
# Entries come from the partial-match candidates plus a manual search for
# gene-name synonyms of each marker.
# NOTE(review): HUMAN_CD3EAP was picked up by name only — confirm it is really
# a subunit of the CD3 complex before interpreting its correlation.
additional_mappings = {
    'CD3': ['HUMAN_CD3D', 'HUMAN_CD3E', 'HUMAN_CD3EAP', 'HUMAN_CD3G'],
    'CD8': ['HUMAN_CD8A', 'HUMAN_CD8B'],
    'CD10': ['HUMAN_MME'],
    'CD11c': ['HUMAN_ITGAX'],
    'CD16': ['HUMAN_FCGR3A', 'HUMAN_FCGR3B'],
    'CD45RA': ['HUMAN_PTPRC'],
    'CD56': ['HUMAN_NCAM1'],
}

In [80]:
# Merge the two sources: start from a copy of the exact matches, then let the
# manually curated mappings replace the entry of every marker they cover.
final_gene_mappings = dict(marker_to_genes)
final_gene_mappings.update(additional_mappings)

In [81]:
# Display the merged marker -> gene ID mapping used for the correlations.
final_gene_mappings

{'CCR5': ['HUMAN_CCR5'],
 'CCR7': ['HUMAN_CCR7'],
 'CD10': ['HUMAN_MME'],
 'CD11c': ['HUMAN_ITGAX'],
 'CD14': ['HUMAN_CD14'],
 'CD16': ['HUMAN_FCGR3A', 'HUMAN_FCGR3B'],
 'CD19': ['HUMAN_CD19'],
 'CD3': ['HUMAN_CD3D', 'HUMAN_CD3E', 'HUMAN_CD3EAP', 'HUMAN_CD3G'],
 'CD34': ['HUMAN_CD34'],
 'CD4': ['HUMAN_CD4'],
 'CD45RA': ['HUMAN_PTPRC'],
 'CD56': ['HUMAN_NCAM1'],
 'CD8': ['HUMAN_CD8A', 'HUMAN_CD8B']}

In [85]:
# NOTE(review): this scratch cell held the incomplete expression `result.`
# (presumably an unfinished `result.correlation`), which is a SyntaxError.
# `result` is also only assigned by the correlation loop in a later cell, so
# the cell could never run from a fresh kernel.
# TODO: delete this cell and its stale output.

0.031343299977977974

In [86]:
# For each ADT marker and each gene ID mapped to it, compute the Spearman rank
# correlation (not Pearson) between the gene's counts and the marker's values
# across cells. Results are keyed by gene ID.
#
# SciPy compares the two vectors position by position and ignores pandas
# labels, so both matrices are first restricted to the same cell barcodes in
# the same order; otherwise a column-order difference between the two files
# would silently produce wrong correlations.
# Assumes barcodes are unique within each matrix — TODO confirm.
shared_barcodes = counts.columns.intersection(labels.columns)

correlation_results = {}
for marker, mapped_gene_ids in final_gene_mappings.items():
    # marker value for every shared cell
    marker_values = labels.loc[marker, shared_barcodes]
    for gene_id in mapped_gene_ids:
        # gene counts for the same cells, in the same order
        gene_counts = counts.loc[gene_id, shared_barcodes]
        result = stats.spearmanr(a=gene_counts, b=marker_values, axis=0)
        print(marker, gene_id, result)
        correlation_results[gene_id] = {
            'ADT_marker': marker,
            'correlation': result.correlation,
            'pvalue': result.pvalue,
        }
        

CD8 HUMAN_CD8A SpearmanrResult(correlation=0.29397115374973276, pvalue=2.4173502593926323e-171)
CD8 HUMAN_CD8B SpearmanrResult(correlation=0.23940434544299147, pvalue=1.3790114474641316e-112)
CD4 HUMAN_CD4 SpearmanrResult(correlation=0.14495126756819499, pvalue=1.1128440454761501e-41)
CD10 HUMAN_MME SpearmanrResult(correlation=0.029175518788739703, pvalue=0.0067591620564750312)
CD56 HUMAN_NCAM1 SpearmanrResult(correlation=0.13419997884680551, pvalue=6.3932469655013789e-36)
CD11c HUMAN_ITGAX SpearmanrResult(correlation=0.21078745385343306, pvalue=3.8679515038378114e-87)
CCR7 HUMAN_CCR7 SpearmanrResult(correlation=0.1079040938314365, pvalue=9.7253502445044284e-24)
CD34 HUMAN_CD34 SpearmanrResult(correlation=0.17102933855357061, pvalue=1.4544562782908765e-57)
CD19 HUMAN_CD19 SpearmanrResult(correlation=0.17958120995540811, pvalue=2.2624339130823561e-63)
CD3 HUMAN_CD3D SpearmanrResult(correlation=0.62326263492201817, pvalue=0.0)
CD3 HUMAN_CD3E SpearmanrResult(correlation=0.6353651112400330

In [87]:
# Turn the nested results dict into a table: each gene ID becomes a column and
# the inner keys (ADT_marker, correlation, pvalue) become the rows.
correlation_results_frame = pd.DataFrame(data=correlation_results)

Unnamed: 0,HUMAN_CCR5,HUMAN_CCR7,HUMAN_CD14,HUMAN_CD19,HUMAN_CD34,HUMAN_CD3D,HUMAN_CD3E,HUMAN_CD3EAP,HUMAN_CD3G,HUMAN_CD4,HUMAN_CD8A,HUMAN_CD8B,HUMAN_FCGR3A,HUMAN_FCGR3B,HUMAN_ITGAX,HUMAN_MME,HUMAN_NCAM1,HUMAN_PTPRC
ADT_marker,CCR5,CCR7,CD14,CD19,CD34,CD3,CD3,CD3,CD3,CD4,CD8,CD8,CD16,CD16,CD11c,CD10,CD56,CD45RA
correlation,0.0313433,0.107904,0.405775,0.179581,0.171029,0.623263,0.635365,0.0127207,0.484327,0.144951,0.293971,0.239404,0.490524,0.0876332,0.210787,0.0291755,0.1342,0.0525272
pvalue,0.00361634,9.72535e-24,0,2.26243e-63,1.45446e-57,0,0,0.237716,0,1.11284e-41,2.41735e-171,1.37901e-112,0,3.66038e-16,3.86795e-87,0.00675916,6.39325e-36,1.06815e-06


In [None]:
# NOTE(review): nothing is written to disk here — this cell only displays the
# correlation table. TODO: add an explicit export (e.g. DataFrame.to_csv) if
# the results are meant to be saved.
correlation_results_frame