In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc


## Count ADlasso features

In [3]:
# Read the annotated expression matrix from disk and report its dimensions.
adata = sc.read_h5ad(
    '/Users/evanli/Documents/Research_datasets/Stuart_GSE128639/Stuart_bm_v2.h5ad'
)
# Disabled clean-up step, kept for reference (the labels printed below already
# contain underscores rather than spaces -- presumably applied upstream):
# adata.obs['celltype.l1'] = [s.replace(' ', '_') for s in adata.obs['celltype.l1']]
print(adata.shape)

# Per-cell level-1 annotation, and the sorted list of distinct labels.
label = adata.obs['celltype.l1'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)

(30011, 17009)
all cell types: ['B_cell', 'Mono&DC', 'NK', 'Progenitor_cells', 'T_cell']


In [4]:
### The raw counts were not normalized, so normalize them here.
# NOTE: both calls below are applied to `adata` without reassignment, so
# re-running this cell would apply them a second time -- run it once per
# freshly loaded `adata`.

# Step 1: CPM (scale to a total of 1e6)
sc.pp.normalize_total(adata, target_sum=1e6)

# Step 2: log1p transform
sc.pp.log1p(adata)

## Build DEGn classifier for each celltype

In [5]:
# Find marker genes
def rank_genes(adata, all_types, groupby='celltype.l1', n_genes=500, method='wilcoxon'):
    """Rank marker genes per group and collect one result table per group.

    Calls ``sc.tl.rank_genes_groups`` once on ``adata`` and then fetches a
    table for every requested group with ``sc.get.rank_genes_groups_df``.

    Parameters
    ----------
    adata
        Object handed straight to scanpy (presumably an AnnData whose
        ``.obs`` contains the ``groupby`` column -- confirm with caller).
        NOTE(review): scanpy records the ranking on this object, so the
        caller's ``adata`` is expected to be modified -- see scanpy docs.
    all_types
        Iterable of group names to extract tables for.
    groupby
        Name of the grouping key forwarded to ``rank_genes_groups``.
        Defaults to ``'celltype.l1'``, the value previously hard-coded.
    n_genes
        Forwarded to ``rank_genes_groups`` as ``n_genes``. Defaults to 500.
    method
        Forwarded to ``rank_genes_groups`` as ``method``. Defaults to
        ``'wilcoxon'``.

    Returns
    -------
    dict
        Maps each entry of ``all_types`` to whatever
        ``sc.get.rank_genes_groups_df(adata, group=...)`` returns for it,
        in the iteration order of ``all_types``.
    """
    sc.tl.rank_genes_groups(adata, groupby=groupby, n_genes=n_genes, method=method)

    # One lookup per requested group; the ranking itself was computed once above.
    return {
        celltype: sc.get.rank_genes_groups_df(adata, group=celltype)
        for celltype in all_types
    }
    

In [6]:
# Compute the per-cell-type ranked gene tables for every label found above.
DE_genes_dict = rank_genes(adata, all_types=types)

In [7]:
# Sanity check: list which cell types have a result table.
DE_genes_dict.keys()

dict_keys(['B_cell', 'Mono&DC', 'NK', 'Progenitor_cells', 'T_cell'])

In [8]:
# Preview the first five rows of the B_cell table.
DE_genes_dict['B_cell'].head(5)

Unnamed: 0,names,scores,logfoldchanges,pvals,pvals_adj
0,CD74,89.212715,7.011836,0.0,0.0
1,HLA-DRA,81.584816,7.234094,0.0,0.0
2,CD79A,79.148575,10.078256,0.0,0.0
3,HLA-DPB1,78.833336,7.149742,0.0,0.0
4,MS4A1,75.038307,10.181658,0.0,0.0


In [11]:
# Show the current working directory; the CSV export further down writes
# to relative paths, so the files land in this directory.
!pwd

/Users/evanli/Documents/EvanPys/Progress/Stuart_bm/Level1_onlylog1p/L1_DEG


In [12]:
# export DEG list
# Write one CSV per cell type into the current working directory.
# NOTE: the "500" in the file name is a fixed string; it is not derived from
# the number of rows in each table.
for celltype, DEG_df in DE_genes_dict.items():
    DEG_df.to_csv(f'{celltype}_DEG500.csv')