In [1]:
from pathlib import Path

# Locate the project root: the closest directory named `home_folder`,
# checking the working directory itself before any of its ancestors.
current_path = Path.cwd()
home_folder = 'evan_home'

candidate_dirs = (current_path, *current_path.parents)
matching_dir = next((d for d in candidate_dirs if d.name == home_folder), None)
if matching_dir is None:
    raise ValueError(f"Folder '{home_folder}' not found in the current working directory.")
home_path = matching_dir

print("Home Path:", home_path)
# Sub-directories of the project root that the following cells build paths from.
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [3]:
# Read the AnnData file from the dataset directory.
h5ad_file = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_Harmony_test_no_scale.h5ad'
adata = sc.read_h5ad(h5ad_file)

# Replace spaces in the cell-type labels with underscores
# (e.g. a label such as "CD14 Mono" becomes "CD14_Mono").
label_column = 'celltype.l2'
adata.obs[label_column] = adata.obs[label_column].str.replace(' ', '_')
labels = adata.obs[label_column]

# Sorted list of the distinct cell-type labels; shown as the cell output.
types = np.unique(labels).tolist()
types

['ASDC',
 'B_intermediate',
 'B_memory',
 'B_naive',
 'CD14_Mono',
 'CD16_Mono',
 'CD4_CTL',
 'CD4_Naive',
 'CD4_Proliferating',
 'CD4_TCM',
 'CD4_TEM',
 'CD8_Naive',
 'CD8_Proliferating',
 'CD8_TCM',
 'CD8_TEM',
 'Doublet',
 'Eryth',
 'HSPC',
 'ILC',
 'MAIT',
 'NK',
 'NK_CD56bright',
 'NK_Proliferating',
 'Plasmablast',
 'Platelet',
 'Treg',
 'cDC1',
 'cDC2',
 'dnT',
 'gdT',
 'pDC']

## Read PreLect features

In [4]:
# Directory expected to hold one `<celltype>_features.txt` file per cell type.
feature_dir = source_code_dir / 'PBMC_Hao_batch_noZ/Level2/feature_selection_k3'
# The working directory is still switched, as the original cell did, in case
# other cells open files by relative path. The reads below use explicit paths
# and therefore no longer depend on it.
os.chdir(feature_dir)

# Read the feature table of each cell type into `features_dict`
# (cell type -> DataFrame with columns Gene, Weight, Tendency).
features_dict = {}
for celltype in types:
    feature_file = feature_dir / f'{celltype}_features.txt'
    try:
        feature_df = pd.read_csv(feature_file, names=['Gene', 'Weight', 'Tendency'], sep='\t')
        # Sort by Tendency (descending), then by absolute weight (descending);
        # the helper column is dropped again once the rows are ordered.
        feature_df = (
            feature_df
            .assign(abs_weight=feature_df['Weight'].abs())
            .sort_values(by=['Tendency', 'abs_weight'], ascending=[False, False])
            .drop(columns=['abs_weight'])
        )
    except (OSError, ValueError, TypeError) as err:
        # Best-effort loading: a missing/unreadable file (OSError), an empty or
        # unparsable file (pandas parser errors derive from ValueError) or
        # non-numeric weights (TypeError) skip this cell type. The reason is
        # printed so a skipped cell type is never silent, and unrelated errors
        # (including KeyboardInterrupt) are no longer swallowed.
        print('skipping:', celltype, f'({type(err).__name__}: {err})')
        continue
    features_dict[celltype] = feature_df

# Per-cell-type summary: total number of features and how many have Tendency == 1.
# Built in a single constructor call instead of growing the frame row by row;
# with no loaded cell types this is an empty frame with the same two columns.
count_df = pd.DataFrame(
    [
        [len(loaded_df), int((loaded_df['Tendency'] == 1).sum())]
        for loaded_df in features_dict.values()
    ],
    index=list(features_dict),
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
ASDC,47,25
B_intermediate,96,49
B_memory,170,90
B_naive,9,6
CD14_Mono,29,16
CD16_Mono,37,17
CD4_CTL,122,45
CD4_Naive,730,393
CD4_Proliferating,70,35
CD4_TCM,247,133


In [5]:
# Build the query text for ACT: one line per cell type, formatted as
# "Clus_<celltype>: <gene1>, <gene2>, ...".
# Cell types with no entry in `features_dict` (skipped while reading the
# feature files) are left out instead of raising a KeyError.
query_lines = []
for celltype in types:
    if celltype not in features_dict:
        continue
    genes = ', '.join(features_dict[celltype]['Gene'].tolist())
    query_lines.append(f'Clus_{celltype}: {genes}\n')

# Join once at the end rather than concatenating strings inside the loop.
text = ''.join(query_lines)

In [6]:
# Print the query text, one "Clus_<celltype>: ..." line per cell type.
print(text)

Clus_ASDC: PPP1R14A, S100A10, PLAC8, SOX4, TCF4, CST3, SAMHD1, APP, ANXA2, AXL, C12orf75, CTSH, FCGRT, IRF8, LILRA4, DAB2, TXN, ALOX5AP, PLP2, PLA2G16, CCDC50, RAC1, PLD4, RAB11FIP1, HLA-DPA1, MALAT1, TMSB4X, S100A9, CD52, FTL, B2M, SAT1, MT-ND2, RPLP1, RPS28, PTPRC, RPS12, MT-CO2, MT-CO1, MT-ATP6, MT-CO3, RPL13, BTG1, RPL34, RPL32, TPT1, MT-ND1
Clus_B_intermediate: LINC01857, IGHM, RALGPS2, MS4A1, ARHGAP24, JUND, BANK1, GPR183, TNFRSF13B, FCRL2, CD79A, POU2F2, FCRL3, COTL1, PTPN1, TCF4, CCDC50, ZBTB20, DDX21, PLAC8, HLA-DPA1, EMP3, RGS2, JUNB, CLECL1, CXXC5, CIB1, HLA-DPB1, HLA-DRA, EZR, IFT57, UBC, TSC22D3, HERPUD1, CD82, SAT1, TNFRSF13C, HLA-DQB1, DDX24, SEPT6, PLEKHO1, ANKRD12, SYK, ZFAS1, HSP90AB1, TOMM7, RAC2, TMA7, CD48, TMSB10, TXNIP, CXCR4, MEF2C, IGHD, TCL1A, TMSB4X, SELL, ACTB, H1FX, MT-CO3, NFKBIA, TPT1, MT-CO2, HMGB1, IGKC, FTL, TAGLN2, HVCN1, NAP1L1, MALAT1, SMAP2, OAZ1, MT-CYB, LYZ, ITM2B, LTB, PLPP5, ACTG1, MT-CO1, GABPB1-AS1, MDM4, HNRNPA1, S100A9, SSR4, MT-ATP6, SARAF