In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc


## Count ADlasso features

In [3]:
# Load the AnnData object from a local .h5ad file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
adata = sc.read_h5ad('/Users/evanli/Documents/Research_datasets/Stuart_GSE128639/Stuart_bm_v2.h5ad')
# adata.obs['celltype.l1'] = [s.replace(' ', '_') for s in adata.obs['celltype.l1']]
print(adata.shape)

# Per-cell labels from the 'celltype.l2' column, then the sorted list of distinct labels.
label = adata.obs['celltype.l2'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)

(30011, 17009)
all cell types: ['CD14_Mono', 'CD16_Mono', 'CD4_Memory', 'CD4_Naive', 'CD56_bright_NK', 'CD8_Effector_1', 'CD8_Effector_2', 'CD8_Memory_1', 'CD8_Memory_2', 'CD8_Naive', 'GMP', 'HSC', 'LMPP', 'MAIT', 'Memory_B', 'NK', 'Naive_B', 'Plasmablast', 'Prog_B_1', 'Prog_B_2', 'Prog_DC', 'Prog_Mk', 'Prog_RBC', 'Treg', 'cDC2', 'gdT', 'pDC']


In [4]:
### Raw counts were not normalized
# NOTE(review): neither call captures a return value, so both appear to modify
# `adata` in place — re-running this cell would transform the data a second time.
# CPM: scale to a total of 1e6 per cell
sc.pp.normalize_total(adata, target_sum=1e6)

# log1p
sc.pp.log1p(adata)

## Build DEGn classifier for each celltype

In [5]:
# Find marker genes
def rank_genes(adata, all_types, groupby='celltype.l2', n_genes=1000, method='wilcoxon'):
    """Rank marker genes per group and collect one result table per group.

    Calls ``sc.tl.rank_genes_groups`` once on ``adata`` and then retrieves the
    ranking for every requested group with ``sc.get.rank_genes_groups_df``.
    The first call's return value is not used; the second call reads the
    ranking back from the same ``adata`` object.

    Parameters
    ----------
    adata
        Annotated data object to rank genes on.
    all_types
        Iterable of group names to extract result tables for.
    groupby : str, default 'celltype.l2'
        Name of the grouping passed to ``sc.tl.rank_genes_groups``.
    n_genes : int, default 1000
        Number of top-ranked genes requested per group.
    method : str, default 'wilcoxon'
        Test method passed to ``sc.tl.rank_genes_groups``.

    Returns
    -------
    dict
        Maps each entry of ``all_types`` (in iteration order) to the table
        returned by ``sc.get.rank_genes_groups_df`` for that group.
    """
    sc.tl.rank_genes_groups(adata, groupby=groupby, n_genes=n_genes, method=method)

    # One table per requested group; an empty `all_types` yields an empty dict.
    return {
        celltype: sc.get.rank_genes_groups_df(adata, group=celltype)
        for celltype in all_types
    }
    

In [6]:
# Rank genes once, then collect one result table per cell type listed in `types`.
DE_genes_dict = rank_genes(adata, types)

  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group

In [7]:
# List the cell types that have a result table.
DE_genes_dict.keys()

dict_keys(['CD14_Mono', 'CD16_Mono', 'CD4_Memory', 'CD4_Naive', 'CD56_bright_NK', 'CD8_Effector_1', 'CD8_Effector_2', 'CD8_Memory_1', 'CD8_Memory_2', 'CD8_Naive', 'GMP', 'HSC', 'LMPP', 'MAIT', 'Memory_B', 'NK', 'Naive_B', 'Plasmablast', 'Prog_B_1', 'Prog_B_2', 'Prog_DC', 'Prog_Mk', 'Prog_RBC', 'Treg', 'cDC2', 'gdT', 'pDC'])

In [9]:
# Preview the first rows of the table for one cell type.
DE_genes_dict['CD14_Mono'].head()

Unnamed: 0,names,scores,logfoldchanges,pvals,pvals_adj
0,FTL,118.320229,3.209847,0.0,0.0
1,S100A9,116.987717,10.048418,0.0,0.0
2,TYROBP,116.837776,9.611924,0.0,0.0
3,LYZ,115.78083,9.971212,0.0,0.0
4,S100A8,115.327438,9.830252,0.0,0.0


In [12]:
# Export one DEG table per cell type as '<celltype>_DEG1000.csv'.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
DEG_TABLE_DIR = '/Users/evanli/Documents/EvanPys/Progress/Stuart_bm/Level2_onlylog1p/L2_DEG/DEG_table'

# Create the output folder if it is missing so chdir does not fail on a fresh run.
os.makedirs(DEG_TABLE_DIR, exist_ok=True)

# Working directory is still switched (as before), so the relative file names
# below — and any later relative paths — resolve against the output folder.
os.chdir(DEG_TABLE_DIR)

# export DEG list
for celltype, DEG_df in DE_genes_dict.items():
    DEG_df.to_csv(f'{celltype}_DEG1000.csv')