#### Wrapper functions for scanpy's `rank_genes_groups` method, to streamline running several batches of comparisons
---
hpb29

In [1]:
import warnings
# NOTE(review): with no category or module filter this suppresses every
# warning for the rest of the session, deprecation notices included.
warnings.filterwarnings('ignore')

import os, sys, json, getpass
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Load the Singularity container labels and display the 'Version' entry.
labels_file = Path('/.singularity.d/labels.json')
singularity = json.loads(labels_file.read_text())

singularity['Version']

'metztli.25c'

In [3]:
# Date stamp (YYYYMMDD) placed at the front of every exported file name;
# the export helpers defined below read `prefix` as a global.
now = datetime.now()
prefix = f'{now:%Y%m%d}'
print(prefix)

20210226


In [4]:
# Build the per-user project directory below the current user's home folder.
user = getpass.getuser()
home = str(Path.home())
basedir = os.path.join(
    home,
    'datafloor/users',
    user,
    'EL/Nicole/2020/SLX19841/',
)

# Point scanpy's write directory at the project's h5ad folder.
sc.settings.writedir = os.path.join(basedir, 'analysis/h5ad/')

In [5]:
# Directory (relative to the notebook's working directory) for exported tables.
output_path = './output/'

In [6]:
# Set scanpy's logging verbosity to level 3.
# NOTE(review): presumably this is what makes the ranking step print its
# progress / "finished" messages — confirm against the scanpy settings docs.
sc.settings.verbosity = 3

In [7]:
def save_dex_results(adata, key, grouping, basename, outpath, file_prefix=None):
    """Write one tab-separated table of ranking statistics per group.

    For every distinct value of ``adata.obs[grouping]`` the per-gene
    ``names``, ``scores``, ``logfoldchanges``, ``pvals`` and ``pvals_adj``
    arrays stored under ``adata.uns[key]`` are written to
    ``<outpath>/<file_prefix><basename><key>_group_<group>.txt`` with the
    columns ``gene_name, scores, logfoldchanges, pvals, pvals_adj``.

    Parameters
    ----------
    adata
        Object exposing ``.obs`` and ``.uns``. ``adata.uns[key]`` must hold
        the five result fields listed above, each indexable by group label.
    key : str
        Entry of ``adata.uns`` holding the ranking results; also part of
        the output file name.
    grouping : str
        Column of ``adata.obs`` whose distinct values are the groups.
    basename : str
        Middle part of the output file name.
    outpath : str
        Output directory; created if it does not exist yet.
    file_prefix : str, optional
        Leading part of every file name. When omitted, the notebook-level
        ``prefix`` variable is used (looked up at call time).

    Raises
    ------
    NameError
        If ``file_prefix`` is omitted and no global ``prefix`` is defined.
    """
    if file_prefix is None:
        # Backward-compatible fallback: the notebook defines `prefix` globally.
        file_prefix = prefix

    statcols = ['gene_name', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj']
    fields = ['names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj']
    results = adata.uns[key]

    if outpath:
        os.makedirs(outpath, exist_ok=True)

    for group in adata.obs[grouping].unique():
        # Look fields up by the string form of the label, so non-string
        # labels select a field and can be joined into the file name.
        label = str(group)

        # One column per statistic: the table length follows the stored
        # results (not adata.shape[1]) and numeric columns stay numeric
        # instead of being coerced to a common dtype with the gene names.
        table = pd.DataFrame(
            {col: np.asarray(results[field][label])
             for col, field in zip(statcols, fields)},
            columns=statcols,
        )
        table.to_csv(
            os.path.join(outpath,
                         file_prefix + basename + key + '_group_' + label + '.txt'),
            index=False, sep='\t')

In [8]:
def export_top_n_diff_genes(adata, key, groups, n, basename, outpath, file_prefix=None):
    """Write the first ``n`` ranked gene names of each group, side by side.

    The output is a tab-separated file with one column per entry of
    ``groups`` (header = group label), saved as
    ``<outpath>/<file_prefix><basename><key>_top<n>_gene_markers.txt``.

    Parameters
    ----------
    adata
        Object exposing ``.uns``; ``adata.uns[key]['names']`` must be
        indexable by group label.
    key : str
        Entry of ``adata.uns`` holding the ranking results; also part of
        the output file name.
    groups : sequence
        Group labels to export, in column order.
    n : int
        Number of leading genes to keep per group. If fewer than ``n``
        genes are stored, all of them are written; the file name still
        carries the requested ``n``.
    basename : str
        Middle part of the output file name.
    outpath : str
        Output directory; created if it does not exist yet.
    file_prefix : str, optional
        Leading part of the file name. When omitted, the notebook-level
        ``prefix`` variable is used (looked up at call time).

    Raises
    ------
    NameError
        If ``file_prefix`` is omitted and no global ``prefix`` is defined.
    """
    if file_prefix is None:
        # Backward-compatible fallback: the notebook defines `prefix` globally.
        file_prefix = prefix

    names = adata.uns[key]['names']
    labels = [str(g) for g in groups]

    # Slicing clips to the available length, so a short ranking no longer
    # trips a fixed-size reshape.
    markers_frame = pd.DataFrame(
        {label: np.asarray(names[label])[:n] for label in labels},
        columns=labels,
    )

    if outpath:
        os.makedirs(outpath, exist_ok=True)

    markers_frame.to_csv(
        os.path.join(outpath,
                     file_prefix + basename + key + '_top' + str(n) + '_gene_markers.txt'),
        index=False, sep='\t')

In [9]:
def bundled_DEx_shenanigans(adata, group, key, bname, outpath, n_top_markers=100,
                            method='wilcoxon', tie_correct=True, use_raw=False):
    """Rank genes per group with scanpy and export the results.

    Runs ``sc.tl.rank_genes_groups`` on ``adata`` (the results are stored
    under ``adata.uns[key]``), then writes one full statistics table per
    group via ``save_dex_results`` and a table of the leading
    ``n_top_markers`` gene names per group via ``export_top_n_diff_genes``.

    Parameters
    ----------
    adata
        Annotated data object to analyse; it receives the ranking results.
    group : str
        Column of ``adata.obs`` defining the groups to compare.
    key : str
        Name under which the results are stored in ``adata.uns``; also
        part of the output file names.
    bname : str
        Middle part of the output file names.
    outpath : str
        Directory the tables are written to.
    n_top_markers : int, default 100
        Number of leading genes per group in the marker table.
    method : str, default 'wilcoxon'
        Forwarded to ``sc.tl.rank_genes_groups``.
    tie_correct : bool, default True
        Forwarded to ``sc.tl.rank_genes_groups``.
    use_raw : bool, default False
        Forwarded to ``sc.tl.rank_genes_groups``.
    """
    # The former hard-coded test settings are keyword arguments with the
    # same defaults, so existing calls behave exactly as before.
    sc.tl.rank_genes_groups(adata,
                            groupby=group,
                            key_added=key,
                            method=method,
                            tie_correct=tie_correct,
                            use_raw=use_raw)

    save_dex_results(adata, key, group, bname, outpath)

    # Export the same groups that were just ranked.
    targets = adata.obs[group].unique()
    export_top_n_diff_genes(adata, key, targets, n_top_markers, bname, outpath)

Example
- DEGs using the Wilcoxon rank-sum test: BM vs LD-PB 

  Universe: combined cells from clusters 10, 12 and 2 



read data

In [10]:
# Load the dataset by its bare name (no directory, no extension).
# NOTE(review): presumably resolved against sc.settings.writedir configured
# above — confirm against the scanpy `read` documentation.
reference = sc.read('COMBO10_NO_SPL3_Seurat3_lognorm')

annotate appropriately for desired comparison

In [16]:
dods = ['DOD1', 'DOD2', 'DOD3', 'DOD4']

# Label every cell as '<DOD|LD>_<value of the 6th obs column>', where 'DOD'
# applies when the 5th obs column holds one of the `dods` identifiers.
# NOTE(review): the two columns are addressed by position, as in the original
# cell — confirm they are the intended ones and consider selecting by name.
# `.iloc` makes the positional access explicit rather than relying on
# integer keys falling back to positions on a string-labelled row, and the
# vectorised form avoids a Python call per cell.
is_dod = reference.obs.iloc[:, 4].isin(dods)
state_prefix = pd.Series(np.where(is_dod, 'DOD_', 'LD_'), index=reference.obs.index)
reference.obs['state'] = state_prefix + reference.obs.iloc[:, 5].astype(object)

subset your target universe

In [17]:
# Keep cells from clusters 10, 12 and 2 whose state is DOD_BM or LD_PB.
in_clusters = reference.obs['leiden.1.2'].isin(['10', '12', '2'])
in_states = reference.obs['state'].isin(['DOD_BM', 'LD_PB'])

# Copy so the subset is an independent object rather than a view of `reference`.
selection = reference[in_clusters & in_states, :].copy()
selection.shape

(14452, 24332)

run the comparisons

In [18]:
# Middle part of every output file name for this comparison; handed to the
# export wrapper as `bname` in the next cell.
basename = '_COMBO10_WRS_cls_10_12_2_BM_LDPB_'

In [19]:
# Rank genes between the 'state' groups of `selection`, store the results in
# selection.uns['DEx_a'], and write the per-group tables plus the top-100
# marker list to `output_path`.
bundled_DEx_shenanigans(adata=selection, 
                        group='state', 
                        key='DEx_a',
                        bname=basename, 
                        outpath=output_path,
                        n_top_markers=100)

ranking genes


... storing 'state' as categorical


    finished: added to `.uns['DEx_a']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:01:06)
