In [1]:
from pathlib import Path

# Locate the project root: the nearest directory named `home_folder` on the
# path from the current working directory up to the filesystem root.
home_folder = 'evan_home'
current_path = Path.cwd()

# Candidates are the working directory itself followed by each ancestor.
for parent in (current_path, *current_path.parents):
    if parent.name != home_folder:
        continue
    home_path = parent
    break
else:
    # The loop ran to completion, so no candidate carried the expected name.
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")

# Sub-directories of the project root used by the cells below.
dataset_dir = home_path / 'Dataset'
source_code_dir = home_path / 'Source_code'
print("Home Path:", home_path)


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import json
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [3]:
### Read representative cells
# Load the Level-2 representative-cell AnnData file from the dataset directory.
h5ad_path = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_L2_repcells_loginv_Harmony_noZ.h5ad'
adata = sc.read_h5ad(h5ad_path)
print('Original adata:', adata.shape)

# Per-cell 'celltype.l2' annotations, and the sorted list of distinct values.
label = adata.obs['celltype.l2'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)

Original adata: (57515, 27504)
all cell types: ['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC']


## Read PreLect features

In [4]:
# Switch to the Level-2 k3 feature-selection folder; the per-cell-type feature
# files below are opened relative to this working directory.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level2/feature_selection_k3')

features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        feature_df_k3 = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Skip only cell types whose feature file is missing, unreadable, empty
        # or malformed, and say why. Any other error (or a keyboard interrupt)
        # propagates instead of being silently swallowed.
        print('skipping:', celltype, f'({type(err).__name__})')
        continue
    features_dict[celltype] = feature_df_k3

# Summarise each cell type: total number of selected features and how many of
# them have Tendency == 1. Rows are collected first and the frame is built
# once, which gives integer columns instead of growing an empty object-dtype
# frame row by row.
count_records = {}
for celltype, feature_df_k3 in features_dict.items():
    feature_count = len(feature_df_k3)
    positive_count = int((feature_df_k3['Tendency'] == 1).sum())
    count_records[celltype] = [feature_count, positive_count]
count_df_k3 = pd.DataFrame.from_dict(
    count_records,
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df_k3

Unnamed: 0,Feature_count,Positive_feature_count
ASDC,47,25
B_intermediate,96,49
B_memory,170,90
B_naive,9,6
CD14_Mono,29,16
CD16_Mono,37,17
CD4_CTL,122,45
CD4_Naive,730,393
CD4_Proliferating,70,35
CD4_TCM,247,133


## Read ACT markers

In [5]:
# Load the processed ACT human-blood marker table, then show its shape and
# first rows.
marker_csv = source_code_dir / 'PBMC_Hao_batch_noZ/Level2/ACT_annotation/Human_blood_marker_processed.csv'
marker_df = pd.read_csv(marker_csv)
print(marker_df.shape)
marker_df.head()

(147, 6)


Unnamed: 0,Species,Tissue,super_type,CellType,Marker,Resource
0,Human,Blood,B,Activated B cell,"CD5, MIR155HG, TAGLN2","35025971, 28910360, 2474446"
1,Human,Blood,T_CD4,"Activated CD4-positive, alpha-beta T cell","CD4, CCR6, CD14, CD19, CD38, CD3D, CD3G, CD69,...","35381424, 32783921, 35831277, 34529726, 103413..."
2,Human,Blood,T_CD4,"Activated CD4-positive, CD25-positive, CCR4-po...","CD3D, CD3E, CD3G, CD4, FOXP3",30977974
3,Human,Blood,,Adult endothelial progenitor cell,"KDR, PROM1, CD34, PTPRC, PECAM1, CD14","27561827, 29391882, 24641802, 29147957, 267700..."
4,Human,Blood,T,Alpha-beta T cell,"CD3D, CD3E, CD3G, IL7R","34284174, 28777444"


In [7]:
def get_celltype_marker(name, marker_df):
    """Return the set of marker genes listed for one cell type.

    Parameters
    ----------
    name : str
        Cell-type name to look up in the ``CellType`` column.
    marker_df : pandas.DataFrame
        Table with a ``CellType`` column and a ``Marker`` column holding a
        comma-separated string of marker names.

    Returns
    -------
    set of str
        Marker names with surrounding whitespace removed. If several rows
        match ``name``, the first one is used. An empty or missing marker
        entry yields an empty set.

    Raises
    ------
    IndexError
        If no row of ``marker_df`` has ``CellType == name``.
    """
    matches = marker_df.loc[marker_df['CellType'] == name, 'Marker']
    if matches.empty:
        # Same exception type as the original `[0]` lookup, but with a message
        # that names the missing cell type.
        raise IndexError(f"Cell type '{name}' not found in marker_df['CellType'].")

    marker_str = matches.iloc[0]
    if not isinstance(marker_str, str):
        # A missing (NaN) marker entry means no markers are listed.
        return set()

    # Split on commas and strip each item so irregular spacing ("A,B", "A ,  B")
    # does not produce merged or padded names; drop empty items.
    marker_set = {item.strip() for item in marker_str.split(',') if item.strip()}
    return marker_set

## Check whether all_genes includes the ACT markers

In [8]:
# Gather every variable (gene) name of `adata` into a set, then report the
# container type and the number of distinct names.
all_genes = {gene for gene in adata.var_names.tolist()}
print(type(all_genes), len(all_genes), sep='\n')

<class 'set'>
27504


In [9]:
# Mapping from cluster label (key) to the cell-type name used in the ACT marker
# table (value). Built from ordered pairs; the insertion order is kept because
# code iterating over this mapping reports results in that order.
cluster_2_ACT_name = dict([
    ('B_memory', 'Memory B cell'),
    ('B_naive', 'Naive B cell'),
    ('CD14_Mono', 'Classical monocyte'),
    ('CD16_Mono', 'Non-classical monocyte'),
    ('CD4_Proliferating', 'Proliferating CD4-positive, alpha-beta T cell'),
    ('CD4_TCM', 'Central memory CD4-positive, alpha-beta T cell'),
    ('CD4_TEM', 'Effector memory CD4-positive, alpha-beta T cell'),
    ('CD8_Naive', 'Naive thymus-derived CD8-positive, alpha-beta T cell'),
    ('CD8_TEM', 'Effector memory CD8-positive, alpha-beta T cell'),
    ('cDC1', 'Type 1 conventional dendritic cell'),
    ('cDC2', 'Type 2 conventional dendritic cell'),
    ('gdT', 'Gamma-delta T cell'),
    ('HSPC', 'Hematopoietic stem cell'),
    ('MAIT', 'Mucosal invariant T cell'),
    ('pDC', 'Plasmacytoid dendritic cell'),
    ('Treg', 'Regulatory T cell'),
    ('CD4_CTL', 'CD4-positive, alpha-beta cytotoxic T cell'),
    ('CD4_Naive', 'Naive thymus-derived CD4-positive, alpha-beta T cell'),
    ('CD8_TCM', 'Central memory CD8-positive, alpha-beta T cell'),
    ('Plasmablast', 'Plasmablast'),
])


In [12]:
# For each mapped ACT cell type, report how many markers it lists and which of
# them are absent from the dataset's gene names.
for name in cluster_2_ACT_name.values():
    print(name)
    marker_set = get_celltype_marker(name, marker_df)
    print('Marker num:', len(marker_set))
    # Markers that do not appear among the dataset's gene names.
    not_in_allgenes = marker_set.difference(all_genes)
    print('Excluded num:', len(not_in_allgenes))
    print(not_in_allgenes)


Memory B cell
Marker num: 17
Excluded num: 0
set()
Naive B cell
Marker num: 17
Excluded num: 1
{'IgD family'}
Classical monocyte
Marker num: 118
Excluded num: 9
{'HLA-DR family', 'BC013828', 'NCF1B', 'AL137655', 'LOC100133161', 'AX747598', 'SMA', 'SMA3', 'AK302511'}
Non-classical monocyte
Marker num: 51
Excluded num: 3
{'HLA-DR family', 'BC013828', 'EMR1'}
Proliferating CD4-positive, alpha-beta T cell
Marker num: 3
Excluded num: 0
set()
Central memory CD4-positive, alpha-beta T cell
Marker num: 26
Excluded num: 0
set()
Effector memory CD4-positive, alpha-beta T cell
Marker num: 21
Excluded num: 0
set()
Naive thymus-derived CD8-positive, alpha-beta T cell
Marker num: 11
Excluded num: 0
set()
Effector memory CD8-positive, alpha-beta T cell
Marker num: 34
Excluded num: 1
{'MMUT'}
Type 1 conventional dendritic cell
Marker num: 14
Excluded num: 0
set()
Type 2 conventional dendritic cell
Marker num: 7
Excluded num: 0
set()
Gamma-delta T cell
Marker num: 9
Excluded num: 0
set()
Hematopoietic 