In [1]:
from pathlib import Path

# Locate the project home folder by walking upward from the working directory.
current_path = Path.cwd()
home_folder = 'evan_home'

# Check the working directory itself first, then each ancestor (nearest first).
for parent in (current_path, *current_path.parents):
    if parent.name == home_folder:
        home_path = parent
        break
else:
    # for/else: this branch runs only when the loop ended without `break`,
    # i.e. neither the working directory nor any ancestor has the target name.
    raise ValueError(
        f"Folder '{home_folder}' not found in the current working directory "
        f"or any of its parent directories: {current_path}"
    )

print("Home Path:", home_path)
# Sub-directories used by the rest of the notebook, resolved relative to the home folder.
source_code_dir = home_path / 'Source_code'
dataset_dir = home_path / 'Dataset'


Home Path: c:\Users\evanlee\Documents\Bmi_NAS_evan\evan_home


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [3]:
# Names of the Leiden clusters, 'Leiden_0' through 'Leiden_23', in numeric order.
clusters = list(map('Leiden_{}'.format, range(24)))
clusters

['Leiden_0',
 'Leiden_1',
 'Leiden_2',
 'Leiden_3',
 'Leiden_4',
 'Leiden_5',
 'Leiden_6',
 'Leiden_7',
 'Leiden_8',
 'Leiden_9',
 'Leiden_10',
 'Leiden_11',
 'Leiden_12',
 'Leiden_13',
 'Leiden_14',
 'Leiden_15',
 'Leiden_16',
 'Leiden_17',
 'Leiden_18',
 'Leiden_19',
 'Leiden_20',
 'Leiden_21',
 'Leiden_22',
 'Leiden_23']

## Read PreLect features

In [4]:
# Read the PreLect feature table of every Leiden cluster and summarise the counts.
# NOTE: os.chdir changes the process-wide working directory; it is kept so that any
# later cell relying on paths relative to this folder keeps working.
os.chdir(source_code_dir / 'HCC_case_study/feature_selection_k3')

features_dict = {}           # cluster name -> all features read for that cluster
positive_features_dict = {}  # cluster name -> rows of that table with Tendency == 1
for celltype in clusters:
    try:
        feature_df = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Skip only clusters whose file is missing, unreadable or malformed, and say why.
        # Anything else (interrupts, programming errors) is allowed to surface.
        print('skipping:', celltype, f'({type(err).__name__}: {err})')
        continue

    # Order rows by Tendency (descending), then by absolute weight (descending).
    # The helper column exists only for sorting and is dropped afterwards.
    feature_df = (
        feature_df
        .assign(abs_weight=feature_df['Weight'].abs())
        .sort_values(by=['Tendency', 'abs_weight'], ascending=[False, False])
        .drop(columns=['abs_weight'])
    )
    positive_df = feature_df.loc[feature_df['Tendency'] == 1, :]

    features_dict[celltype] = feature_df
    positive_features_dict[celltype] = positive_df

# One row per successfully read cluster; built in a single constructor call
# instead of growing the frame row by row.
count_df = pd.DataFrame(
    {
        'Feature_count': [df.shape[0] for df in features_dict.values()],
        'Positive_feature_count': [df.shape[0] for df in positive_features_dict.values()],
    },
    index=list(features_dict.keys()),
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
Leiden_0,41,27
Leiden_1,91,44
Leiden_2,32,13
Leiden_3,117,60
Leiden_4,34,21
Leiden_5,182,69
Leiden_6,38,21
Leiden_7,63,30
Leiden_8,99,48
Leiden_9,69,42


In [9]:
# Build the query text for ACT from all features: one line per cluster,
# formatted as "<cluster>: gene1, gene2, ...".
# A cluster whose feature file was skipped while loading has no entry in
# features_dict, so indexing by every name in `clusters` would raise KeyError.
# Only loaded clusters are written; the others are reported.
missing_clusters = [celltype for celltype in clusters if celltype not in features_dict]
if missing_clusters:
    print('no features loaded for:', ', '.join(missing_clusters))

all_text = ''.join(
    f"{celltype}: {', '.join(features_dict[celltype]['Gene'].tolist())}\n"
    for celltype in clusters
    if celltype in features_dict
)

In [10]:
# Show the assembled all-features query text (one "<cluster>: genes" line per cluster).
print(all_text)

Leiden_0: MT-ND1, NDUFB9, NEAT1, TMEM176A, AMBP, SERPINA1, VTN, MPC2, MT-CO1, RARRES2, SPINK1, ANG, PRAP1, BNIP3, HSPA5, KRT18, TM4SF4, MDK, MT-CO2, NDUFC2, RBP4, SEC61G, CD63, COX6C, COX6A1, S100A10, HSP90B1, TUBA1B, B2M, PTMA, TMSB4X, CD74, H2AFZ, RPL10, ALB, HLA-DRA, APOA1, HMGB1, MT2A, ACTB, SRGN
Leiden_1: IL7R, CXCR4, ANXA1, GZMK, ZFP36L2, CD3D, RGCC, CD69, RPS29, TRAC, LTB, LEPROTL1, RPS25, S100A4, CD52, RPS27, SRGN, TNFAIP3, RORA, CD3E, RP11-138A9.1, SARAF, GPR183, RPL28, RPL17, RPL31, CD2, RPS4Y1, RPL36A, CCL5, BTG1, BIRC3, AC090498.1, JUNB, RPS12, CD44, RPL10A, IL32, YPEL5, CREM, MCL1, ZFP36, RPS15A, RPL23A, NKG7, ACTB, CMC1, FTL, CD74, IGKC, HLA-DRA, DDX5, H3F3A, APOA2, ALB, GAPDH, HLA-DRB1, CD63, MALAT1, SAT1, HMGB1, MT-CO1, HLA-DPB1, NEAT1, MT-ND3, SRP14, UBB, HLA-C, MT-CO2, SERF2, HNRNPA2B1, MT-ND4, MYL6, HLA-A, CYBA, OAZ1, LGALS1, PPIA, HSPE1, ACTG1, TMBIM6, SUB1, FOS, HSPA5, HSP90AA1, MT-CO3, DUSP1, UBC, HLA-B, HLA-E, PFN1
Leiden_2: C1QA, CTSB, TYROBP, GPNMB, LGMN, C1QB,

In [14]:
# Build the query text from the positive features only (rows with Tendency == 1):
# one line per cluster, formatted as "<cluster>: gene1, gene2, ...".
# A cluster whose feature file was skipped while loading has no entry in
# positive_features_dict, so indexing by every name in `clusters` would raise
# KeyError. Only loaded clusters are written; the others are reported.
missing_positive = [celltype for celltype in clusters if celltype not in positive_features_dict]
if missing_positive:
    print('no positive features loaded for:', ', '.join(missing_positive))

positive_text = ''.join(
    f"{celltype}: {', '.join(positive_features_dict[celltype]['Gene'].tolist())}\n"
    for celltype in clusters
    if celltype in positive_features_dict
)

In [15]:
# Show the assembled positive-features query text (one "<cluster>: genes" line per cluster).
print(positive_text)

Leiden_0: MT-ND1, NDUFB9, NEAT1, TMEM176A, AMBP, SERPINA1, VTN, MPC2, MT-CO1, RARRES2, SPINK1, ANG, PRAP1, BNIP3, HSPA5, KRT18, TM4SF4, MDK, MT-CO2, NDUFC2, RBP4, SEC61G, CD63, COX6C, COX6A1, S100A10, HSP90B1
Leiden_1: IL7R, CXCR4, ANXA1, GZMK, ZFP36L2, CD3D, RGCC, CD69, RPS29, TRAC, LTB, LEPROTL1, RPS25, S100A4, CD52, RPS27, SRGN, TNFAIP3, RORA, CD3E, RP11-138A9.1, SARAF, GPR183, RPL28, RPL17, RPL31, CD2, RPS4Y1, RPL36A, CCL5, BTG1, BIRC3, AC090498.1, JUNB, RPS12, CD44, RPL10A, IL32, YPEL5, CREM, MCL1, ZFP36, RPS15A, RPL23A
Leiden_2: C1QA, CTSB, TYROBP, GPNMB, LGMN, C1QB, C1QC, CCL3, CTSD, PSAP, FABP5, RNASE1, CCL4L2
Leiden_3: TYROBP, LST1, AIF1, LYZ, SAT1, HLA-DPB1, HLA-DRA, CST3, HLA-DPA1, NEAT1, GPX1, COTL1, FTH1, BCL2A1, IL1B, TIMP1, LGALS2, G0S2, SLC25A6, FCN1, HLA-DQB1, APOA2, NAP1L1, CSTA, SRGN, LSP1, RPL26, FCER1G, S100A6, LGALS1, C15orf48, RPS3A, SH3BGRL3, ENO1, RPS24, NAMPT, PLAUR, SOD2, ANXA1, S100A4, AP1S2, CSTB, RPL18A, MT2A, S100A10, GABARAP, RPS2, RPL29, PLIN2, C1orf162