In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os



## Read PreLect features

In [2]:
# Labels of the 16 Leiden clusters ('Leiden_0' ... 'Leiden_15'); the cells below
# use them as dictionary keys and as file-name prefixes.
clusters = ['Leiden_' + str(idx) for idx in range(16)]
print(clusters)

['Leiden_0', 'Leiden_1', 'Leiden_2', 'Leiden_3', 'Leiden_4', 'Leiden_5', 'Leiden_6', 'Leiden_7', 'Leiden_8', 'Leiden_9', 'Leiden_10', 'Leiden_11', 'Leiden_12', 'Leiden_13', 'Leiden_14', 'Leiden_15']


In [4]:
# Sanity check: display the first cluster label.
clusters[0]

'Leiden_0'

In [7]:
# Preview a single feature table (cluster Leiden_0) before loading all of them.
# NOTE(review): the working directory is switched to a hard-coded absolute path.
os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Zheng/Leiden_HVG_PreLect/feature_selection_symbol')
df = pd.read_csv(
    'Leiden_0_features_symbol.txt',
    sep='\t',
    header=0,  # the file's own header row is replaced by the names below
    names=['Gene', 'Weight', 'Tendency', 'Symbol'],
)
df

Unnamed: 0,Gene,Weight,Tendency,Symbol
0,ENSG00000142669,0.046448,1,SH3BGRL3
1,ENSG00000169442,0.279994,1,CD52
2,ENSG00000122406,0.041489,1,RPL5
3,ENSG00000197747,0.116894,1,S100A10
4,ENSG00000196154,1.376208,1,S100A4
...,...,...,...,...
92,ENSG00000198888,-0.057154,0,MT-ND1
93,ENSG00000198804,-0.265438,0,MT-CO1
94,ENSG00000198712,-0.183598,0,MT-CO2
95,ENSG00000198938,-0.117060,0,MT-CO3


In [8]:
# NOTE(review): hard-coded absolute path; the chdir is kept so the working
# directory after this cell is the same as before.
os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Zheng/Leiden_HVG_PreLect/feature_selection_symbol')

# Load the feature table of every cluster into features_dict (cluster label -> DataFrame).
# Each table is ordered by Tendency descending, then by |Weight| descending.
features_dict = {}
for celltype in clusters:
    file = f'{celltype}_features_symbol.txt'
    print(file)
    try:
        raw_df = pd.read_csv(file, names=['Gene', 'Weight', 'Tendency', 'Symbol'], sep='\t', header=0)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Only a missing, unreadable, empty or malformed file is skipped.
        # Anything else (e.g. KeyboardInterrupt, unexpected dtypes) propagates
        # instead of being silently swallowed by a bare `except:`.
        print('skipping:', celltype, f'({err})')
        continue
    # Sort on a temporary |Weight| column without mutating the frame in place.
    feature_df = (
        raw_df
        .assign(abs_weight=raw_df['Weight'].abs())
        .sort_values(by=['Tendency', 'abs_weight'], ascending=[False, False])
        .drop(columns=['abs_weight'])
    )
    features_dict[celltype] = feature_df

# Per-cluster summary: total number of features and number with Tendency == 1.
# Built in one go from records (rather than row-by-row via .loc) so the
# columns get an integer dtype.
count_df = pd.DataFrame(
    [
        (cluster_df.shape[0], int((cluster_df['Tendency'] == 1).sum()))
        for cluster_df in features_dict.values()
    ],
    index=list(features_dict),
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Leiden_0_features_symbol.txt
Leiden_1_features_symbol.txt
Leiden_2_features_symbol.txt
Leiden_3_features_symbol.txt
Leiden_4_features_symbol.txt
Leiden_5_features_symbol.txt
Leiden_6_features_symbol.txt
Leiden_7_features_symbol.txt
Leiden_8_features_symbol.txt
Leiden_9_features_symbol.txt
Leiden_10_features_symbol.txt
Leiden_11_features_symbol.txt
Leiden_12_features_symbol.txt
Leiden_13_features_symbol.txt
Leiden_14_features_symbol.txt
Leiden_15_features_symbol.txt


Unnamed: 0,Feature_count,Positive_feature_count
Leiden_0,97,45
Leiden_1,21,12
Leiden_2,141,79
Leiden_3,89,46
Leiden_4,152,106
Leiden_5,17,7
Leiden_6,22,6
Leiden_7,12,4
Leiden_8,7,5
Leiden_9,7,3


In [11]:
# Build the query text for ACT: one line per cluster of the form
#   Clus_<cluster>: SYMBOL1, SYMBOL2, ...
# Iterate over features_dict (not `clusters`) so that a cluster whose feature
# file was skipped while loading does not raise a KeyError here. dict order is
# insertion order, so the lines stay in cluster order.
query_lines = []
for celltype, cluster_df in features_dict.items():
    # Drop missing symbols: ', '.join would fail on NaN entries.
    symbols = cluster_df['Symbol'].dropna().astype(str).tolist()
    query_lines.append(f"Clus_{celltype}: {', '.join(symbols)}\n")
# Join once instead of growing the string with += inside the loop.
text = ''.join(query_lines)

In [12]:
# Display the assembled query text.
print(text)

Clus_Leiden_0: IL32, S100A4, LTB, FXYD5, FTH1, LDHB, CD52, B2M, VIM, GSTK1, TPT1, PLP2, ANXA1, RPL36, S100A10, EEF1A1, HINT1, JUNB, PFN1, CD99, RPS27, IER2, RPSA, RPS25, RPL4, RPL30, IL7R, SH3BGRL3, GAPDH, RPL5, RPL29, RPL36A, RPS29, EEF2, RPL14, GIMAP7, RPL9, GLTSCR2, ITM2B, CD3E, COTL1, RPS27A, RPL35, PPDPF, TAGLN2, CCL5, CD74, RPL13, MALAT1, RPS2, CYBA, FTL, MT-CO1, CTSW, RPS14, HCST, HLA-B, RPS19, CD7, RPS24, MT-CO2, RPS9, TMSB4X, COX4I1, SERF2, MT-CO3, RPL28, ACTB, MYL6, RPL19, RPS15, HLA-C, CORO1A, CFL1, MT-ND1, TMSB10, RPL34, RPS23, MT-ND4, RPL15, ACTG1, UBB, PTMA, ARPC3, LIMD2, ID2, ATP5E, RPL13A, ARPC2, EIF3K, PTPRCAP, RPS16, RPL10, RPS5, EEF1D, BTG1, GMFG
Clus_Leiden_1: CCL5, GZMK, IL32, DUSP1, JUN, LTB, DUSP2, JUNB, FOS, RPLP0, EEF1A1, S100A4, GNLY, TMSB4X, TMSB10, FTH1, OAZ1, MALAT1, B2M, CD74, ARHGDIB
Clus_Leiden_2: JUN, ACTB, LTB, S100A4, PFN1, IER2, LDHB, RPL34, JUNB, CD3D, GIMAP7, TPT1, CD27, EEF1A1, RPS25, RPL36, FTH1, NOSIP, CD3E, PABPC1, FXYD5, RPL5, CD52, RPL9, RPL2

In [None]:
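# Write the query text to a file (currently disabled).
# NOTE(review): the path below points at PBMC_Hao_batch/Level2_log_inv, not the PBMC_Zheng Leiden data used above — confirm before enabling.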
# os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_batch/Level2_log_inv/ACT_annotation')
# with open('L2_query.txt', 'w') as f:
#     f.write(text)