In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os



In [2]:
# Load the Harmony batch-corrected (unscaled) Hao PBMC AnnData object from disk.
hao_h5ad_path = '/Users/evanli/Documents/Research_datasets/PBMC_Hao/GSE164378_Hao/batch_corrected/Hao_PBMC_Harmony_unscaled.h5ad'
adata = sc.read_h5ad(hao_h5ad_path)

# Replace spaces with underscores in the level-1 cell-type labels and store
# the result back in `adata.obs`, so downstream code sees the underscore form.
label_col = 'celltype.l1'
underscored_labels = adata.obs[label_col].str.replace(' ', '_')
adata.obs[label_col] = underscored_labels
labels = adata.obs[label_col]

# Sorted list of the distinct level-1 labels; the bare name displays it.
unique_labels = np.unique(labels)
types = unique_labels.tolist()
types

['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']

## Read PreLect features

In [7]:
# NOTE: changes the process-wide working directory; the relative
# '<celltype>_features.txt' paths below are resolved against it.
os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_batch/Level1/feature_selection')

# Read the feature table of each cell type into `features_dict`
# (cell type -> DataFrame with columns Gene, Weight, Tendency).
# Each '<celltype>_features.txt' is read as a header-less, tab-separated file.
features_dict = {}
for celltype in types:
    try:
        feature_df = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Only a missing/unreadable/unparsable file is a reason to skip a cell
        # type. Anything else (e.g. a programming error or Ctrl-C) must surface
        # instead of being silently swallowed.
        print('skipping:', celltype, '-', err)
        continue
    # Order rows by Tendency (descending), then by absolute weight (descending).
    # The helper column is only needed for sorting and is dropped afterwards.
    feature_df = (
        feature_df
        .assign(abs_weight=feature_df['Weight'].abs())
        .sort_values(by=['Tendency', 'abs_weight'], ascending=[False, False])
        .drop(columns=['abs_weight'])
    )
    features_dict[celltype] = feature_df

# Summarise, per loaded cell type, the total number of features and the number
# of features with Tendency == 1. Rows are collected first and the frame is
# built once, so the count columns get an integer dtype rather than object.
count_rows = []
for celltype, feature_df in features_dict.items():
    feature_count = feature_df.shape[0]
    positive_count = int((feature_df['Tendency'] == 1).sum())
    count_rows.append([feature_count, positive_count])
count_df = pd.DataFrame(
    count_rows,
    index=list(features_dict),
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
B,11,6
CD4_T,346,197
CD8_T,22,7
DC,32,15
Mono,19,10
NK,33,17
other,4,2
other_T,59,25


In [10]:
# write to query for ACT
text = ''
for celltype in types:
    features = features_dict[celltype]['Gene'].tolist()
    features = ', '.join(features)
    # print(features)
    text_celltype = f'{celltype}: {features}\n'
    text += text_celltype

In [11]:
# Use print() rather than a bare `text` expression: each cell type's entry ends
# with '\n', and print renders those as real line breaks instead of the
# escaped string repr.
print(text)

B: CD79A, MS4A1, IGKC, HLA-DRA, CD74, IGHM, TMSB4X, B2M, MALAT1, ACTB, S100A4
CD4_T: CD4, CD40LG, TRAT1, TOMM7, ITM2A, RNASET2, CD52, CD2, TRBC1, TMSB10, CD6, CORO1B, MAF, IL7R, IL6ST, TRAC, CD3G, ARID5B, GIMAP7, MAL, GAPDH, TRBC2, TSHZ2, IL32, LTB, RORA, CALM1, S100A4, CD247, ITGB1, ARL4C, FYB1, GPR183, ADD3, TPT1, ETS1, KLF3, CD3D, LIMS1, NSD3, PTPRC, RPL11, GSTK1, EML4, S100A11, SYNE2, SMCHD1, RPS20, CYLD, EMP3, IKZF1, HMGB2, INPP4B, AAK1, RPL24, TNFAIP3, H1FX, ANXA1, KLF2, CD3E, USP15, RPL36, GMFG, ANKRD12, AES, RPS27, EID1, TXN, RPL38, ATP5MC2, LDHB, FOXP1, LINC00861, SOD1, FHIT, JUND, RPL30, PBXIP1, AQP3, NOP53, OPTN, TUBA1B, BIRC3, FTH1, FXYD5, PRDX2, MZT2A, SUB1, MZT2B, NDUFA12, C12orf57, TNFAIP8, RPS25, NDUFS5, MALAT1, RPL3, PSIP1, STK17B, APRT, CD69, RPL39, PTGES3, ANP32B, CHD3, RPL36AL, BCL11B, RASGRP1, PFN1, KLRB1, H3F3B, NSA2, MT-CYB, RPS29, RPL34, RPS15A, RPS14, RPL37, RPL14, RPL9, RPL29, RPSA, RPL35, RPL23, MT-ATP6, B2M, TMSB4X, RPL10, RPS27A, RPL41, RPL32, RPL13, CDC42,