In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc


## Count ADlasso features

In [3]:
# Location of the Hao PBMC (GSE164378) h5ad file. Set the HAO_PBMC_H5AD
# environment variable to point at a copy stored elsewhere; when it is unset
# the original local path below is used, so existing runs are unaffected.
data_path = os.environ.get(
    'HAO_PBMC_H5AD',
    '/Users/evanli/Documents/Research_datasets/PBMC_Hao/GSE164378_Hao/Hao_PBMC_GSE164378.h5ad',
)
adata = sc.read_h5ad(data_path)
# Replace spaces with underscores in the `celltype.l1` labels
# (e.g. so that they can be used directly in file names later on).
adata.obs['celltype.l1'] = [s.replace(' ', '_') for s in adata.obs['celltype.l1']]
print(adata.shape)

# `label`: one entry per row of adata.obs; `types`: the distinct labels.
label = adata.obs['celltype.l1'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)

(161764, 33538)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


In [4]:
### The loaded raw counts were not normalized, so normalize them here.
# NOTE(review): neither call's return value is captured and `adata` is reused
# afterwards, so both steps presumably modify `adata` in place. Re-running this
# cell without reloading the data would then transform already-transformed
# values - confirm and reload `adata` before re-running.

# CPM: scale with target_sum=1e6 (counts per million)
sc.pp.normalize_total(adata, target_sum=1e6)

# log1p transform of the normalized values
sc.pp.log1p(adata)

## Build DEGn classifier for each celltype

In [5]:
# Find marker genes
def rank_genes(adata, all_types, groupby='celltype.l1', n_genes=2000, method='wilcoxon'):
    """Rank marker genes per group and collect one result table per cell type.

    Runs ``sc.tl.rank_genes_groups`` once on ``adata`` and then fetches the
    table for every requested cell type via ``sc.get.rank_genes_groups_df``.

    Parameters
    ----------
    adata
        Annotated data object handed to scanpy. The ranking call's return
        value is not used; the tables are read back from ``adata``
        afterwards, so this function modifies ``adata``.
    all_types
        Iterable of group names to extract; each one must be a group of the
        ``groupby`` column.
    groupby
        Name of the ``adata.obs`` column that defines the groups.
        Defaults to ``'celltype.l1'`` (the previously hard-coded value).
    n_genes
        Number of top-ranked genes to keep per group. Defaults to 2000.
    method
        Test passed to ``sc.tl.rank_genes_groups``. Defaults to ``'wilcoxon'``.

    Returns
    -------
    dict
        Maps each entry of ``all_types`` to the table returned by
        ``sc.get.rank_genes_groups_df`` for that group, in the order given.
    """
    # One ranking pass covers every group of the `groupby` column.
    sc.tl.rank_genes_groups(adata, groupby=groupby, n_genes=n_genes, method=method)

    # Pull the per-group result table for each requested cell type.
    return {
        celltype: sc.get.rank_genes_groups_df(adata, group=celltype)
        for celltype in all_types
    }
    

In [6]:
# Rank genes once and keep one result table per cell type listed in `types`.
DE_genes_dict = rank_genes(adata, types)

In [7]:
# Check which cell types have a result table (one key per entry of `types`).
DE_genes_dict.keys()

dict_keys(['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T'])

In [12]:
# Preview the first rows of the table for the 'DC' group.
DE_genes_dict['DC'].head()

Unnamed: 0,names,scores,logfoldchanges,pvals,pvals_adj
0,CD74,96.476372,5.477156,0.0,0.0
1,HLA-DPA1,92.968788,6.485485,0.0,0.0
2,HLA-DPB1,92.936043,6.250789,0.0,0.0
3,HLA-DQA1,92.403793,8.195508,0.0,0.0
4,HLA-DRA,91.820473,6.938076,0.0,0.0


In [13]:
# Show the current working directory; the export cell below writes its CSV
# files with relative names, so this is where they will be created.
!pwd

/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_GSE/L1_by_lossdiff/L1_DEG


In [24]:
# Write each cell type's DEG table to '<celltype>_DEG2000.csv'.
# The file names are relative, so the files land in the working directory.
for celltype, DEG_df in DE_genes_dict.items():
    DEG_df.to_csv(f'{celltype}_DEG2000.csv')