In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc


## Count ADlasso features

In [7]:
# Load the Hao PBMC AnnData object and replace spaces in the level-2
# cell type names with underscores.
adata = sc.read_h5ad('/home/jovyan/work/Research_datasets/Hao_PBMC.h5ad')
adata.obs['celltype.l2'] = list(
    map(lambda name: name.replace(' ', '_'), adata.obs['celltype.l2'])
)

# Per-cell labels, and the distinct label values as a plain Python list.
label = adata.obs['celltype.l2'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)

# normalize_total with target_sum=1e6, followed by a log1p transform.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)
print(adata.shape)


all cell types: ['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC']
(161764, 20568)


In [10]:
os.chdir('/home/jovyan/work/GitHub/EvanPys/Progress/PBMC_Hao/Level2_pvl0_ttsplit/lambda_decision_ignore_section/L2_feature_selection')
# C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao\Level2_pvl0_ttsplit\lambda_decision_ignore_section\L2_feature_selection

# Read the selected-feature table (Gene, Weight, Tendency) of each cell type
# from '<celltype>_features.txt' in the current directory.
features_dict = {}
for celltype in types:
    try:
        features_dict[celltype] = pd.read_csv(
            f'{celltype}_features.txt',
            names=['Gene', 'Weight', 'Tendency'],
            sep='\t',
        )
    except (FileNotFoundError, pd.errors.EmptyDataError) as err:
        # Only a missing or empty feature file is an expected reason to skip a
        # cell type; any other failure (malformed file, interrupt, ...) should
        # surface instead of being silently swallowed.
        print('skipping:', celltype, f'({err})')

# Per cell type: total number of features, and number of features whose
# Tendency equals 1. The frame is built once rather than grown row by row.
count_columns = ['Feature_count', 'Positive_feature_count']
count_rows = [
    [len(table), int((table['Tendency'] == 1).sum())]
    for table in features_dict.values()
]
count_df = pd.DataFrame(count_rows, index=list(features_dict), columns=count_columns)
count_df.head()

Unnamed: 0,Feature_count,Positive_feature_count
ASDC,10,6
B_intermediate,11,8
B_memory,20,12
B_naive,21,12
CD14_Mono,15,7


In [11]:
# Sanity check: one row per cell type that had a feature file, two count columns.
count_df.shape

(31, 2)

## Build DEGn classifier for each celltype

In [12]:
def rank_genes(adata, all_types, groupby='celltype.l2', n_genes=2000, method='wilcoxon'):
    """Rank marker genes per group and collect one result table per cell type.

    Parameters
    ----------
    adata
        Annotated data object handed to ``sc.tl.rank_genes_groups`` and
        ``sc.get.rank_genes_groups_df``.
    all_types
        Iterable of group names for which a ranked-gene table is fetched.
    groupby, n_genes, method
        Forwarded unchanged to ``sc.tl.rank_genes_groups``. The defaults
        reproduce the values that were previously hard-coded here.

    Returns
    -------
    dict
        Maps each name in ``all_types`` to the result of
        ``sc.get.rank_genes_groups_df(adata, group=name)``.
    """
    # Run the ranking once for all groups; per-group tables are pulled below.
    sc.tl.rank_genes_groups(adata, groupby=groupby, n_genes=n_genes, method=method)

    return {
        celltype: sc.get.rank_genes_groups_df(adata, group=celltype)
        for celltype in all_types
    }
    

In [13]:
# Rank DE genes for every level-2 cell type; the result maps
# cell type -> ranked-gene DataFrame.
DE_genes_dict = rank_genes(adata, types)



  foldchanges = (self.expm1_func(mean_group) + 1e-9) / (
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'na

In [14]:
# Cell types for which a ranked-gene table was collected.
DE_genes_dict.keys()

dict_keys(['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC'])

In [15]:
# Inspect the ranked-gene table for ASDC
# (columns: names, scores, logfoldchanges, pvals, pvals_adj).
DE_genes_dict['ASDC']

Unnamed: 0,names,scores,logfoldchanges,pvals,pvals_adj
0,ENSG00000166428,14.892009,12.159752,3.714410e-50,7.639799e-46
1,ENSG00000167641,14.828171,16.513487,9.632159e-50,9.905712e-46
2,ENSG00000196628,14.780492,19.565844,1.957309e-49,1.323210e-45
3,ENSG00000235162,14.762052,15.929845,2.573337e-49,1.323210e-45
4,ENSG00000101608,14.684780,27.117870,8.069543e-49,3.319487e-45
...,...,...,...,...,...
1995,ENSG00000257103,4.886111,1.621129,1.028474e-06,1.043978e-05
1996,ENSG00000166848,4.886038,1.470130,1.028853e-06,1.043978e-05
1997,ENSG00000089327,4.885623,1.771358,1.031023e-06,1.045665e-05
1998,ENSG00000060491,4.885075,1.854394,1.033895e-06,1.048060e-05


In [16]:

# Walk through a single cell type by hand before using the classifier function.
celltype = 'ASDC'

# Take as many top-ranked DE genes as there are entries in this cell type's
# feature table (Feature_count).
n_features = count_df.loc[celltype, 'Feature_count']
DEGn = DE_genes_dict[celltype]['names'][:n_features].tolist()

# Expression matrix restricted to the DEGn genes.
X = adata[:, DEGn].X
print(f'{celltype} DEGn adata shape:', X.shape)

# One-vs-rest binary labels: 1 for the target cell type, 0 for everything else.
y = [
    int(cell_label == celltype)
    for cell_label in adata.obs['celltype.l2'].tolist()
]



ASDC DEGn adata shape: (161764, 10)


In [17]:
# Gene IDs selected as DEGn for the current cell type.
DEGn

['ENSG00000166428',
 'ENSG00000167641',
 'ENSG00000196628',
 'ENSG00000235162',
 'ENSG00000101608',
 'ENSG00000135916',
 'ENSG00000106803',
 'ENSG00000152492',
 'ENSG00000116288',
 'ENSG00000140968']

In [18]:
def LR_classifier(data, DEGn, celltype, label_key='celltype.l2',
                  test_size=0.2, random_state=0, C=1.0):
    """Train and evaluate a one-vs-rest logistic regression for one cell type.

    Parameters
    ----------
    data
        Annotated data object; ``data[:, DEGn].X`` supplies the feature matrix
        and ``data.obs[label_key]`` the per-cell labels.
    DEGn
        Gene identifiers used to subset the columns of ``data``.
    celltype
        Label treated as the positive class (1); every other label is 0.
    label_key
        Column of ``data.obs`` holding the cell type labels.
    test_size, random_state
        Forwarded to ``train_test_split``.
    C
        Forwarded to ``LogisticRegression`` (L2 penalty, lbfgs solver).

    The defaults of the keyword parameters reproduce the values that were
    previously hard-coded in this function.

    Returns
    -------
    tuple
        ``(clf, metrics, y_test, y_pred)`` where ``metrics`` is
        ``[accuracy, precision, f1, roc_auc]`` computed on the held-out split.
    """
    print('==================')
    print('Constructing classifier for:', celltype)

    # Feature matrix restricted to the requested genes.
    X = data[:, DEGn].X
    print(X.shape)

    # Binary target: 1 for the requested cell type, 0 otherwise.
    y = [1 if i == celltype else 0 for i in data.obs[label_key].tolist()]

    # Stratify on y so the split keeps the positive/negative ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Train classifier.
    clf = LogisticRegression(penalty='l2', solver='lbfgs', C=C)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split using hard predictions.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    precision = precision_score(y_test, y_pred)  # average='macro'
    print("Precision: {:.2f}%".format(precision * 100))
    f1 = f1_score(y_test, y_pred)
    print("F1: {:.2f}%".format(f1 * 100))

    # ROC-AUC is computed from the predicted probability of the positive class.
    y_pred_prob = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    print('ROC-AUC:', roc_auc)

    # Model metrics, in a fixed order.
    metrics = [accuracy, precision, f1, roc_auc]

    return clf, metrics, y_test, y_pred

In [19]:
# Fit and evaluate the one-vs-rest classifier for the cell type selected above,
# using its DEGn gene list.
clf_AS, metrics_AS, y_t, y_p = LR_classifier(adata, DEGn, celltype)

Constructing classifier for: ASDC
(161764, 10)
Accuracy: 99.97%
Precision: 100.00%
F1: 50.00%
ROC-AUC: 0.9996639660255221


In [20]:
# Check which classes are present in the held-out labels.
np.unique(y_t)

array([0, 1])

In [21]:
# Check which classes the classifier actually predicted on the held-out split.
np.unique(y_p)

array([0, 1])

In [22]:
# Show the current working directory.
!pwd

/home/jovyan/work/GitHub/EvanPys/Progress/PBMC_Hao/Level2_pvl0_ttsplit/lambda_decision_ignore_section/L2_feature_selection


In [23]:
# Change the working directory; files written later with relative paths land here.
os.chdir('/home/jovyan/work/GitHub/EvanPys/Progress/PBMC_Hao/Classifier_evaluation')

In [24]:
# Write each cell type's ranked DE gene table to '<celltype>_DEG2000.csv'
# in the current working directory.
for celltype, DEG_df in DE_genes_dict.items():
    DEG_df.to_csv(f'{celltype}_DEG2000.csv')