In [1]:
from pathlib import Path

# Locate the project home folder by walking up from the current working
# directory, so the paths below resolve no matter which sub-folder the
# notebook kernel was started in.
current_path = Path.cwd()
home_folder = 'evan_home'

# Check the working directory itself first, then each ancestor (nearest first);
# the first directory whose name matches wins.
home_path = next(
    (
        candidate
        for candidate in (current_path, *current_path.parents)
        if candidate.name == home_folder
    ),
    None,
)
if home_path is None:
    # The search covers the working directory AND all of its ancestors, so say so,
    # and include the starting point to make the failure easy to diagnose.
    raise ValueError(
        f"Folder '{home_folder}' not found in the current working directory "
        f"'{current_path}' or any of its parent directories."
    )

print("Home Path:", home_path)
# Project sub-directories used by the rest of the notebook.
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [3]:
# Read the AnnData object from the shared dataset directory.
h5ad_path = dataset_dir / 'PBMC_Hao/GSE164378_Hao/Harmony_noZ/Hao_Harmony_test_no_scale.h5ad'
adata = sc.read_h5ad(h5ad_path)

# Replace spaces in the level-1 cell type labels with underscores; the labels
# are used further down to build '<celltype>_features.txt' file names.
label_col = 'celltype.l1'
adata.obs[label_col] = adata.obs[label_col].str.replace(' ', '_')
labels = adata.obs[label_col]

# Sorted list of the distinct cell type labels (displayed as the cell output).
types = np.unique(labels).tolist()
types

['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']

## Read PreLect features

In [4]:
# Switch to the directory holding the per-celltype feature files.
# NOTE: os.chdir changes the kernel's working directory for every later cell;
# it is kept here because other cells may rely on that side effect.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/feature_selection_k3')

features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        feature_df = pd.read_csv(
            f'{celltype}_features.txt',
            names=['Gene', 'Weight', 'Tendency'],
            sep='\t',
        )
    except FileNotFoundError:
        # Only a missing feature file is a legitimate reason to skip a cell type;
        # any other failure (malformed file, wrong dtypes, ...) should surface
        # instead of being silently reported as "skipping".
        print('skipping:', celltype)
        continue

    # Order rows by Tendency (descending), then by absolute weight (descending).
    # The helper column is only needed for sorting and is dropped again.
    feature_df = (
        feature_df
        .assign(abs_weight=feature_df['Weight'].abs())
        .sort_values(by=['Tendency', 'abs_weight'], ascending=[False, False])
        .drop(columns=['abs_weight'])
    )
    features_dict[celltype] = feature_df

# Summarise, per loaded cell type, the total number of features and how many
# of them have Tendency == 1. Built in one call rather than row by row.
count_df = pd.DataFrame.from_dict(
    {
        loaded_type: [len(loaded_df), int((loaded_df['Tendency'] == 1).sum())]
        for loaded_type, loaded_df in features_dict.items()
    },
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
B,19,10
CD4_T,201,95
CD8_T,23,9
DC,50,23
Mono,50,20
NK,33,17
other,5,3
other_T,247,112


In [5]:
# write to query for ACT
# Build one "celltype: gene1, gene2, ...\n" line per cell type. Cell types
# without a loaded feature table (skipped by the loader) are reported and
# omitted instead of raising KeyError.
query_lines = []
for celltype in types:
    if celltype not in features_dict:
        print('Skipping', celltype)
        continue
    genes = features_dict[celltype]['Gene'].tolist()
    query_lines.append(f"{celltype}: {', '.join(genes)}\n")

# Join once at the end rather than growing a string inside the loop.
text = ''.join(query_lines)

In [6]:
# Show the assembled query text: one "celltype: gene, gene, ..." line per cell type.
print(text)

B: CD79A, MS4A1, IGKC, BANK1, IGLC2, RALGPS2, CD79B, CD74, IGHM, HLA-DRA, TMSB4X, B2M, PTPRC, MALAT1, FTH1, ACTB, S100A4, MYL12A, ITM2B
CD4_T: CD4, TRAT1, TOMM7, CD40LG, CD52, RNASET2, ITM2A, CD2, TRBC1, IL7R, TRAC, TMSB10, IL32, CD3G, CD6, ITGB1, GIMAP7, TRBC2, ARID5B, LTB, RORA, CORO1B, MAF, FYB1, S100A4, MAL, GAPDH, IL6ST, GSTK1, CD3D, CD247, ARL4C, CALM1, ETS1, NSD3, CD3E, GPR183, S100A11, ADD3, RPS20, SMCHD1, RPL38, KLF3, TPT1, CYLD, ANKRD12, RPL36, KLF2, ANXA1, EML4, RPL11, HMGB2, EMP3, LDHB, AAK1, SYNE2, H1FX, INPP4B, AES, FXYD5, LIMS1, TNFAIP3, FOXP1, RPL24, IKZF1, NOP53, PTPRC, MZT2B, ATP5MC2, SOD1, RPS25, GMFG, JUND, RPL39, MALAT1, EID1, NDUFS5, RPS27, RPL30, RPS29, FTH1, MT-CYB, RPL34, RPL14, RPL9, RPL3, RPL37, RPS15A, RPS26, PFN1, RPSA, RPL23, RPL35, RPL36AL, C12orf57, HCST, CCL5, CTSW, CD7, MT-ND4, CD74, FTL, NUCB2, CYBA, ATM, HSP90AB1, MT-CO2, DUSP2, H3F3A, HLA-DPB1, HLA-A, RPS4Y1, HLA-DRA, AIF1, HLA-C, UBB, OAZ1, PDCD4, NKG7, MT-CO1, MT-CO3, GNAS, GUK1, GYPC, KLF13, TXNI

## Read PreLect features (large lambda, <40 features)

In [7]:
# Switch to the directory holding the large-lambda feature files.
# NOTE: os.chdir raises FileNotFoundError if this directory does not exist, and
# on success changes the kernel's working directory for every later cell; it is
# kept here because other cells may rely on that side effect.
os.chdir(source_code_dir / 'PBMC_Hao_batch_noZ/Level1/large_lambda/features')

features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        feature_df = pd.read_csv(
            f'{celltype}_features.txt',
            names=['Gene', 'Weight', 'Tendency'],
            sep='\t',
        )
    except FileNotFoundError:
        # Only a missing feature file is a legitimate reason to skip a cell type;
        # any other failure (malformed file, wrong dtypes, ...) should surface
        # instead of being silently reported as "skipping".
        print('skipping:', celltype)
        continue

    # Order rows by Tendency (descending), then by absolute weight (descending).
    # The helper column is only needed for sorting and is dropped again.
    feature_df = (
        feature_df
        .assign(abs_weight=feature_df['Weight'].abs())
        .sort_values(by=['Tendency', 'abs_weight'], ascending=[False, False])
        .drop(columns=['abs_weight'])
    )
    features_dict[celltype] = feature_df

# Summarise, per loaded cell type, the total number of features and how many
# of them have Tendency == 1. Built in one call rather than row by row.
count_df = pd.DataFrame.from_dict(
    {
        loaded_type: [len(loaded_df), int((loaded_df['Tendency'] == 1).sum())]
        for loaded_type, loaded_df in features_dict.items()
    },
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'c:\\Users\\evanlee\\Documents\\Bmi_NAS_evan\\evan_home\\Source_code\\PBMC_Hao_batch_noZ\\Level1\\large_lambda\\features'

In [4]:
# List the cell types for which a feature table was loaded.
features_dict.keys()

dict_keys(['CD4_T', 'DC', 'Mono', 'other_T'])

In [5]:
# write to query for ACT
# Collect one "celltype: gene1, gene2, ...\n" line for every cell type that has
# a loaded feature table; cell types without one are reported and left out.
query_lines = []
for celltype in types:
    if celltype in features_dict:
        gene_names = features_dict[celltype]['Gene'].tolist()
        query_lines.append(f"{celltype}: {', '.join(gene_names)}\n")
    else:
        print('Skipping', celltype)
text = ''.join(query_lines)

Skipping B
Skipping CD8_T
Skipping NK
Skipping other


In [6]:
# Show the assembled query text: one "celltype: gene, gene, ..." line per cell type.
print(text)

CD4_T: IL7R, TRAC, LTB, FYB1, CD2, IL32, CD52, CD3G, LDHB, TRBC1, ITGB1, CD3D, TOMM7, GIMAP7, TRBC2, S100A4, GSTK1, TMSB10, RNASET2, RPS26, HCST, CCL5, CD74, MT-ND4, MT-CO1, FTL, CYBA, MT-CO2, HLA-B, MT-ND2, MT-ND1, MT-ND3, NKG7, MT-CO3, OAZ1, HSP90AB1
DC: CST3, FCER1A, PPP1R14B, CCDC88A, HLA-DQA1, HLA-DPB1, ARL4C, PLD4, CCDC50, HLA-DPA1, HERPUD1, HLA-DRA, TXN, SEC61B, SAMHD1, CD74, FTL, MALAT1, CD52, NEAT1, HLA-E, KLF2, BTG1, TXNIP, B2M, HLA-C, MT-ND2, MT-ND3, MT-ATP6, SAT1, MT-CO2, CTSS, PTPRC, S100A9, HLA-A, MT-CO3, S100A8
Mono: CTSS, NEAT1, PSAP, CFD, S100A8, AIF1, SERPINA1, CYBB, FCN1, LST1, S100A9, SAT1, FTL, NFKBIA, MARCKS, CCL3, DUSP1, CD14, B2M, RPS18, RPS27, MYL12A, HLA-DPB1, MT-ATP6, MALAT1, RPS29, TMSB4X, RPL3, TAGLN2, RPS15A, MT-CYB, RPL13A, OST4, RPS3, RPS5, RPSA, RPS3A, TXNIP, GNAS
other_T: TRDC, CD3D, KLRB1, GZMK, CD3G, TRGC2, TRGC1, IL7R, DUSP2, NCR3, LYAR, CD3E, S100A6, CXCR4, PHACTR2, IL32, MT2A, ARL4C, KLRG1, ZFP36L2, SPOCK2, SYNE2, PPP2R5C, TMSB10, B2M, NEAT1, MALA