In [1]:
from pathlib import Path

# Locate the project root: the closest directory named `home_folder`,
# starting at the current working directory and walking up through its parents.
home_folder = 'evan_home'
current_path = Path.cwd()

home_path = None
for parent in (current_path, *current_path.parents):
    if parent.name == home_folder:
        home_path = parent
        break

# No ancestor (nor the cwd itself) carries the expected name: stop here.
if home_path is None:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")

print("Home Path:", home_path)

# Sub-folders of the project root used by the following cells
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'

Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import json
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [3]:
### Read representative cells
# The .h5ad file is resolved relative to `dataset_dir`, so no machine-specific
# absolute path is needed.
h5ad_path = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_L1_repcells_loginv_Harmony_noZ.h5ad'
adata = sc.read_h5ad(h5ad_path)
print('Original adata:', adata.shape)

# Sorted unique values of the 'celltype.l1' annotation
# (recorded run: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T'])
label = adata.obs['celltype.l1'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)

Original adata: (59897, 27504)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


## Read PreLect features

In [3]:
# types = ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']

In [4]:
# Move into the folder holding one '<celltype>_features.txt' file per cell type.
# NOTE: this changes the kernel's working directory for every cell that runs afterwards.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/feature_selection_k3')

features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        feature_df_k3 = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
    except (FileNotFoundError, pd.errors.EmptyDataError) as err:
        # Only a missing or empty feature file is a reason to skip a cell type;
        # any other error (parsing problem, interrupt, ...) must surface.
        print('skipping:', celltype, f'({type(err).__name__})')
        continue
    features_dict[celltype] = feature_df_k3

# Per cell type: total number of features and number of features with Tendency == 1.
count_rows = {}
for celltype, feature_df_k3 in features_dict.items():
    feature_count = len(feature_df_k3)
    positive_count = int((feature_df_k3['Tendency'] == 1).sum())
    count_rows[celltype] = [feature_count, positive_count]

# Build the summary table in one go instead of growing it row by row.
count_df_k3 = pd.DataFrame.from_dict(
    count_rows,
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df_k3

Unnamed: 0,Feature_count,Positive_feature_count
B,19,10
CD4_T,201,95
CD8_T,23,9
DC,50,23
Mono,50,20
NK,33,17
other,5,3
other_T,247,112


## Read ACT markers

In [5]:
# ACT marker table, resolved relative to `source_code_dir`
marker_csv = source_code_dir / 'PBMC_Hao_batch_noZ/Level1/ACT_annotation/Human_blood_marker_processed.csv'
marker_df = pd.read_csv(marker_csv)
print(marker_df.shape)
marker_df.head()

(147, 6)


Unnamed: 0,Species,Tissue,super_type,CellType,Marker,Resource
0,Human,Blood,B,Activated B cell,"CD5, MIR155HG, TAGLN2","35025971, 28910360, 2474446"
1,Human,Blood,T_CD4,"Activated CD4-positive, alpha-beta T cell","CD4, CCR6, CD14, CD19, CD38, CD3D, CD3G, CD69,...","35381424, 32783921, 35831277, 34529726, 103413..."
2,Human,Blood,T_CD4,"Activated CD4-positive, CD25-positive, CCR4-po...","CD3D, CD3E, CD3G, CD4, FOXP3",30977974
3,Human,Blood,,Adult endothelial progenitor cell,"KDR, PROM1, CD34, PTPRC, PECAM1, CD14","27561827, 29391882, 24641802, 29147957, 267700..."
4,Human,Blood,T,Alpha-beta T cell,"CD3D, CD3E, CD3G, IL7R","34284174, 28777444"


In [6]:
def get_celltype_marker(name, marker_df):
    """Return the set of marker names listed for one cell type.

    Parameters
    ----------
    name : str
        Value to look up in the ``CellType`` column of ``marker_df``.
    marker_df : pandas.DataFrame
        Table with at least a ``CellType`` column and a ``Marker`` column whose
        entries are comma-separated marker names.

    Returns
    -------
    set of str
        The markers of the first row whose ``CellType`` equals ``name``, with
        surrounding whitespace removed. Empty if that row has no marker string.

    Raises
    ------
    IndexError
        If no row has ``CellType == name``.
    """
    matches = marker_df.loc[marker_df['CellType'] == name, 'Marker'].tolist()
    if not matches:
        # Same exception type as the former `[...][0]` lookup, but with a usable message.
        raise IndexError(f"Cell type {name!r} not found in the 'CellType' column of marker_df.")

    string = matches[0]
    if not isinstance(string, str):
        # Missing value (NaN/None) in the 'Marker' column: nothing to split.
        return set()

    # Split on ',' and strip, so irregular spacing around the commas cannot
    # produce symbols with stray whitespace or merged entries.
    marker_set = {marker.strip() for marker in string.split(',')}
    marker_set.discard('')
    return marker_set

## Check whether all_genes includes the ACT markers

In [7]:
# Gene names of the dataset as a set, for membership tests and set arithmetic
all_genes = set(adata.var_names)
print(type(all_genes))
print(len(all_genes))

<class 'set'>
27504


In [8]:
# Explicit mapping: cluster label -> cell-type name used in the ACT marker table.
# Written out pair by pair so that the correspondence does not depend on the
# order (or number) of the cluster labels found in the data.
# The labels 'other' and 'other_T' have no entry here (presumably on purpose,
# as they were also left unmapped before — TODO confirm).
cluster_2_ACT_name = {
    'B': 'B cell',
    'CD4_T': 'CD4-positive, alpha-beta T cell',
    'CD8_T': 'CD8-positive, alpha-beta T cell',
    'DC': 'Dendritic cell',
    'Mono': 'Monocyte',
    'NK': 'Natural killer cell',
}

# ACT names in the same order as the mapping above
ACT_name = list(cluster_2_ACT_name.values())
cluster_2_ACT_name

{'B': 'B cell',
 'CD4_T': 'CD4-positive, alpha-beta T cell',
 'CD8_T': 'CD8-positive, alpha-beta T cell',
 'DC': 'Dendritic cell',
 'Mono': 'Monocyte',
 'NK': 'Natural killer cell'}

In [None]:
# Is the name 'SIGLEC2' among the gene names of the Hao dataset (all_genes, built from adata.var_names)?
'SIGLEC2' in all_genes

False

In [None]:
# Is the name 'CD22' among the gene names of the Hao dataset (all_genes, built from adata.var_names)?
'CD22' in all_genes

True

In [14]:
# Is 'SIGLEC2' listed among the ACT markers of the 'Dendritic cell' entry?
'SIGLEC2' in get_celltype_marker('Dendritic cell', marker_df)

True

In [None]:
# Is 'CD22' listed among the ACT markers of the 'Dendritic cell' entry?
'CD22' in get_celltype_marker('Dendritic cell', marker_df)

True

In [12]:
# For every ACT cell type, report how many of its markers exist and which of
# them have no identically named gene in all_genes (the Hao gene names).
for name in ACT_name:
    print(name)
    marker_set = get_celltype_marker(name, marker_df)
    not_in_allgenes = marker_set.difference(all_genes)
    print(f'Marker num: {len(marker_set)}')
    print(f'Excluded num: {len(not_in_allgenes)}')
    print(not_in_allgenes)


B cell
Marker num: 59
Excluded num: 1
{'IgD family'}
CD4-positive, alpha-beta T cell
Marker num: 41
Excluded num: 0
set()
CD8-positive, alpha-beta T cell
Marker num: 26
Excluded num: 0
set()
Dendritic cell
Marker num: 49
Excluded num: 2
{'MHC class II', 'SIGLEC2'}
Monocyte
Marker num: 113
Excluded num: 9
{'TRB', 'FCGR2C', 'H1-3', 'COX2', 'HLA-DR family', 'TRA', 'IgG family', 'HLA-DRB3', 'HLA-DRB4'}
Natural killer cell
Marker num: 74
Excluded num: 1
{'CCL3L3'}


## Alias dictionary

In [None]:
# {ACT_marker_alias: Hao_all_genes_alias}
# Maps a marker name as written in the ACT table to the list of gene name(s)
# to use for it in the Hao dataset.
alias_dict = {'SIGLEC2': ['CD22'],
              'IgD family': ['IGHD'],
              # Every dataset gene whose name starts with 'HLA-D'. `all_genes` is a set,
              # so the matches are sorted to get the same list order on every run.
              # NOTE(review): the prefix match also picks up 'HLA-DQB1-AS1'
              # (see the recorded output) — confirm that it should be kept.
              'MHC class II': sorted(g for g in all_genes if g.startswith('HLA-D')),
              'HLA-DR family': ['HLA-DRB1', 'HLA-DRA', 'HLA-DRB5'],
              'COX2': ['PTGS2'],
              'H1-3': ['HIST1H1D'],
              'EMR1': ['ADGRE1'],
              'MMUT': ['MUT'],
              'IGJ': ['JCHAIN'],
              }
alias_dict

{'SIGLEC2': ['CD22'],
 'IgD family': ['IGHD'],
 'MHC class II': ['HLA-DRB1',
  'HLA-DQA1',
  'HLA-DQA2',
  'HLA-DRA',
  'HLA-DQB2',
  'HLA-DMB',
  'HLA-DOB',
  'HLA-DQB1',
  'HLA-DPA1',
  'HLA-DPB1',
  'HLA-DMA',
  'HLA-DRB5',
  'HLA-DQB1-AS1',
  'HLA-DOA'],
 'HLA-DR family': ['HLA-DRB1', 'HLA-DRA', 'HLA-DRB5'],
 'COX2': ['PTGS2'],
 'H1-3': ['HIST1H1D'],
 'EMR1': ['ADGRE1'],
 'MMUT': ['MUT'],
 'IGJ': ['JCHAIN']}