In [1]:
from pathlib import Path

# Get the current working directory as a Path object
current_path = Path.cwd()
home_folder = 'evan_home'

# Traverse up the directory tree until you find the target folder
for parent in [current_path] + list(current_path.parents):
    if parent.name == home_folder:
        home_path = parent
        break
else:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")

print("Home Path:", home_path)
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import os
from matplotlib_venn import venn2


In [3]:
marker_df = pd.read_csv('Human_blood_marker_processed.csv')
print(marker_df.shape)
marker_df.head()

(147, 6)


Unnamed: 0,Species,Tissue,super_type,CellType,Marker,Resource
0,Human,Blood,B,Activated B cell,"CD5, MIR155HG, TAGLN2","35025971, 28910360, 2474446"
1,Human,Blood,T_CD4,"Activated CD4-positive, alpha-beta T cell","CD4, CCR6, CD14, CD19, CD38, CD3D, CD3G, CD69,...","35381424, 32783921, 35831277, 34529726, 103413..."
2,Human,Blood,T_CD4,"Activated CD4-positive, CD25-positive, CCR4-po...","CD3D, CD3E, CD3G, CD4, FOXP3",30977974
3,Human,Blood,,Adult endothelial progenitor cell,"KDR, PROM1, CD34, PTPRC, PECAM1, CD14","27561827, 29391882, 24641802, 29147957, 267700..."
4,Human,Blood,T,Alpha-beta T cell,"CD3D, CD3E, CD3G, IL7R","34284174, 28777444"


In [4]:
marker_df['super_type'].unique()

array(['B', 'T_CD4', nan, 'T', 'Monocyte', 'DC', 'NK', 'T_CD4_cytotoxic',
       'T_CD4_memory', 'T_CD8_cytotoxic', 'T_CD8_memory', 'T_CD8',
       'T_CD4_Tcm', 'T_CD8_Tcm', 'T_cm', 'B_memory', 'T_CD4_Tem', 'T_em',
       'T_CD8_Tem', 'Erythrocyte', 'T_helper', 'T_reg', 'T_memory',
       'B_naive', 'Platelet'], dtype=object)

In [5]:
marker_df['CellType'].unique()

array(['Activated B cell', 'Activated CD4-positive, alpha-beta T cell',
       'Activated CD4-positive, CD25-positive, CCR4-positive, alpha-beta regulatory T cell, human',
       'Adult endothelial progenitor cell', 'Alpha-beta T cell',
       'Alternatively activated macrophage', 'Atypical B cell', 'B cell',
       'Basophil', 'Cardiac muscle cell',
       'CD14-low, CD16-positive monocyte', 'CD14-positive monocyte',
       'CD14-positive, CD16-negative classical monocyte',
       'CD14-positive, CD16-positive monocyte',
       'CD141-positive, CLEC9A-positive dendritic cell',
       'CD16-negative natural killer cell',
       'CD16-negative, CD56-bright natural killer cell, human',
       'CD16-positive myeloid dendritic cell',
       'CD16-positive natural killer cell',
       'CD16-positive, CD56-dim natural killer cell, human',
       'CD1c-positive myeloid dendritic cell', 'CD38-positive B cell',
       'CD4-intermediate, CD8-positive double-positive thymocyte',
       'CD4-posit

## Read PreLect features

In [6]:
types = ['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC']

In [7]:
# os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_batch_noZ/Level2/feature_selection_k3')
# os.chdir(r"C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level2\feature_selection_k3")
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level2/feature_selection_k3')

features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        feature_df_k3 = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
        features_dict[celltype] = feature_df_k3
    except:
        print('skipping:', celltype)
        continue

count_df_k3 = pd.DataFrame(columns=['Feature_count', 'Positive_feature_count'])
for celltype in features_dict.keys():
    feature_df_k3 = features_dict[celltype]
    feature_count = feature_df_k3.shape[0]
    positive_count = feature_df_k3[feature_df_k3['Tendency'] == 1].shape[0]
    count_df_k3.loc[celltype] = [feature_count, positive_count]
count_df_k3

Unnamed: 0,Feature_count,Positive_feature_count
ASDC,47,25
B_intermediate,96,49
B_memory,170,90
B_naive,9,6
CD14_Mono,29,16
CD16_Mono,37,17
CD4_CTL,122,45
CD4_Naive,730,393
CD4_Proliferating,70,35
CD4_TCM,247,133


In [7]:
# Do two comparisons:
#   1. All_PreLect vs. Marker in database
#   2. Positive_PreLect vs. Marker in database
def compare_PreLect_w_marker(celltype, PreLect_dict, marker_genes, name=''):
    if name == '':
        name = celltype
    # Extracting the features and marker genes
    feature_df = PreLect_dict[celltype]
    features = feature_df['Gene'].tolist()
    positve_features = feature_df[feature_df['Tendency'] == 1]['Gene'].tolist()
    # marker_genes = marker_df['official gene symbol'].tolist()

    ### All features
    # Finding common elements
    common_all = list(set(features) & set(marker_genes))
    print(celltype, 'All features common with markers:', len(common_all))
    # Preparing sets for Venn diagram
    features_set = set(features)
    marker_genes_set = set(marker_genes)
    # Plotting Venn diagram
    plt.figure(figsize=(8, 5))
    v = venn2([features_set, marker_genes_set], set_labels=('PreLect all', f'{name} Marker'))
    for idx in ('10', '01', '11'):  # Each region in a 2-set Venn diagram
        if v.get_label_by_id(idx) and int(v.get_label_by_id(idx).get_text()) == 0:
            v.get_label_by_id(idx).set_text('')
    plt.title(name, fontsize=16)
    for text in v.set_labels:
        text.set_fontsize(14)
    for text in v.subset_labels:
        text.set_fontsize(16)
    plt.savefig(f"{name}_PreLect_all_vs_marker.png")
    plt.close()
    # plt.show()

    ### Positive features
    # Finding common elements
    common_positive = list(set(positve_features) & set(marker_genes))
    print(celltype, 'Positive features common with markers:', len(common_positive))
    # Preparing sets for Venn diagram
    positive_features_set = set(positve_features)
    marker_genes_set = set(marker_genes)
    # Plotting Venn diagram
    plt.figure(figsize=(8, 5))
    v = venn2([positive_features_set, marker_genes_set], set_labels=('PreLect positive', f'{name} Marker'), set_colors=('blue', 'green'))
    for idx in ('10', '01', '11'):  # Each region in a 2-set Venn diagram
        if v.get_label_by_id(idx) and int(v.get_label_by_id(idx).get_text()) == 0:
            v.get_label_by_id(idx).set_text('')
    plt.title(name, fontsize=16)
    for text in v.set_labels:
        print(text)
        text.set_fontsize(14)
    for text in v.subset_labels:
        text.set_fontsize(16)
    plt.savefig(f"{name}_PreLect_positive_vs_marker.png")
    plt.close()
    # plt.show()

    return common_all, common_positive


In [8]:
def get_celltype_marker(name, marker_df):
    string = marker_df[marker_df['CellType'] == name]['Marker'].tolist()[0]
    marker_list = string.split(', ')
    return marker_list

## Function for running comparison

In [9]:
def run_compare(clus_name, act_name, features_dict, marker_df, marker_list=None):
    if marker_list:
        common, common_p = compare_PreLect_w_marker(clus_name, features_dict, marker_list, name=clus_name)
    else:
        common, common_p = compare_PreLect_w_marker(clus_name, features_dict, get_celltype_marker(act_name, marker_df), name=clus_name)
    print(f'{clus_name}_common:', common)
    print(f'{clus_name}_common_p:', common_p)

    return common, common_p

In [10]:
clus_ACT_name_dict = {
    # 20 types
    'B_memory': 'Memory B cell',
    'B_naive': 'Naive B cell',
    'CD14_Mono': 'Classical monocyte',
    'CD16_Mono': 'Non-classical monocyte',
    'CD4_Proliferating': 'Proliferating CD4-positive, alpha-beta T cell',
    'CD4_TCM': 'Central memory CD4-positive, alpha-beta T cell',
    'CD4_TEM': 'Effector memory CD4-positive, alpha-beta T cell',
    'CD8_Naive': 'Naive thymus-derived CD8-positive, alpha-beta T cell',
    'CD8_TEM': 'Effector memory CD8-positive, alpha-beta T cell',
    'cDC1': 'Type 1 conventional dendritic cell',
    'cDC2': 'Type 2 conventional dendritic cell',
    'gdT': 'Gamma-delta T cell',
    'HSPC': 'Hematopoietic stem cell',
    'MAIT': 'Mucosal invariant T cell',
    'pDC': 'Plasmacytoid dendritic cell',
    'Treg': 'Regulatory T cell',
    'CD4_CTL': 'CD4-positive, alpha-beta cytotoxic T cell',
    'CD4_Naive': 'Naive thymus-derived CD4-positive, alpha-beta T cell',
    'CD8_TCM': 'Central memory CD8-positive, alpha-beta T cell',
    'Plasmablast': 'Plasmablast',
    # added
    'NK': 'Natural killer cell', 
    'NK_CD56bright': 'CD16-negative, CD56-bright natural killer cell, human'
}


## Run

In [None]:
# os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_batch_noZ/Level2/ACT_annotation/venn_plot')
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level2/ACT_annotation/venn_plot')
overlapped_dict = {}
for clus, name in clus_ACT_name_dict.items():
    common, common_positive = run_compare(clus, name, features_dict, marker_df)
    overlapped_dict[clus] = common

B_memory All features common with markers: 5
B_memory Positive features common with markers: 4
Text(-0.09736495539698528, -0.5652145942620618, 'PreLect positive')
Text(0.5154615285722748, -0.26703827784280404, 'B_memory Marker')
B_memory_common: ['IGHG3', 'IGHM', 'MS4A1', 'IGHG1', 'AIM2']
B_memory_common_p: ['IGHG1', 'IGHG3', 'MS4A1', 'AIM2']
B_naive All features common with markers: 4
B_naive Positive features common with markers: 4
Text(-0.3084798873361627, -0.359583030112296, 'PreLect positive')
Text(0.1088752543539398, -0.5762057736504398, 'B_naive Marker')
B_naive_common: ['IGHM', 'TCL1A', 'MS4A1', 'IGHD']
B_naive_common_p: ['IGHM', 'TCL1A', 'MS4A1', 'IGHD']
CD14_Mono All features common with markers: 9
CD14_Mono Positive features common with markers: 9
Text(-0.4541203771241833, -0.23935136658660686, 'PreLect positive')
Text(0.06157564435582141, -0.585665459563262, 'CD14_Mono Marker')
CD14_Mono_common: ['S100A8', 'MS4A6A', 'NEAT1', 'CD14', 'VCAN', 'FCN1', 'LYZ', 'S100A9', 'CEBPD']

## Export overlapped marker dict

In [14]:
overlapped_dict

{'B_memory': ['IGHG1', 'IGHG3', 'IGHM', 'AIM2', 'MS4A1'],
 'B_naive': ['IGHM', 'IGHD', 'MS4A1', 'TCL1A'],
 'CD14_Mono': ['LYZ',
  'NEAT1',
  'VCAN',
  'S100A8',
  'MS4A6A',
  'CD14',
  'S100A9',
  'CEBPD',
  'FCN1'],
 'CD16_Mono': ['CST3', 'AIF1', 'LST1', 'CDKN1C', 'FCER1G', 'TCF7L2', 'FCGR3A'],
 'CD4_Proliferating': ['MKI67', 'STMN1', 'TUBA1B'],
 'CD4_TCM': ['MAL',
  'CD69',
  'TRAC',
  'IL32',
  'TMSB10',
  'AQP3',
  'SELL',
  'CD3D',
  'LDHB',
  'LTB',
  'CD3G',
  'ITGB1',
  'ZFP36L2',
  'CD4',
  'KLF2',
  'CD3E',
  'ANXA1',
  'IL7R'],
 'CD4_TEM': ['GZMA',
  'KLRB1',
  'GNLY',
  'CD3G',
  'ITGB1',
  'CD4',
  'PTPRC',
  'GZMK',
  'IL7R',
  'CCL5'],
 'CD8_Naive': ['CD8A',
  'CCR7',
  'SELL',
  'CD8B',
  'CD27',
  'CD3E',
  'CD3D',
  'IL7R',
  'LEF1'],
 'CD8_TEM': ['NKG7',
  'TRAC',
  'TRGC2',
  'CD8A',
  'GZMA',
  'KLRD1',
  'GNLY',
  'CST7',
  'CD8B',
  'CD3G',
  'GZMH',
  'CD3D',
  'PRF1',
  'GZMK',
  'CCL5'],
 'cDC1': ['BATF3', 'HLA-DPA1', 'CLEC9A'],
 'cDC2': ['CLEC10A', 'FCER1A'],

In [15]:
for k, v in overlapped_dict.items():
    print(k, len(v))

B_memory 5
B_naive 4
CD14_Mono 9
CD16_Mono 7
CD4_Proliferating 3
CD4_TCM 18
CD4_TEM 10
CD8_Naive 9
CD8_TEM 15
cDC1 3
cDC2 2
gdT 5
HSPC 2
MAIT 6
pDC 4
Treg 11
CD4_CTL 3
CD4_Naive 12
CD8_TCM 6
Plasmablast 3
NK 11
NK_CD56bright 2


In [None]:
import json

# os.chdir(r"C:\Users\evanlee\Documents\GitHub\EvanPys\Progress\PBMC_Hao_batch_noZ\Level2\ACT_annotation")
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level2/ACT_annotation')
with open('L2_PreLect_ACT_overlap.json', 'w') as f:
    json.dump(overlapped_dict, f)