In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import os

In [3]:
# Load the Harmony batch-corrected (unscaled) Hao PBMC AnnData object.
adata = sc.read_h5ad(
    '/Users/evanli/Documents/Research_datasets/PBMC_Hao/GSE164378_Hao/batch_corrected/Hao_PBMC_Harmony_unscaled.h5ad'
)

# Replace spaces in the level-2 labels with underscores; the per-cell-type
# feature files read later are named '<celltype>_features.txt'.
label_col = 'celltype.l2'
adata.obs[label_col] = adata.obs[label_col].str.replace(' ', '_')
labels = adata.obs[label_col]

# Sorted list of the distinct level-2 labels.
types = np.unique(labels).tolist()
types

['ASDC',
 'B_intermediate',
 'B_memory',
 'B_naive',
 'CD14_Mono',
 'CD16_Mono',
 'CD4_CTL',
 'CD4_Naive',
 'CD4_Proliferating',
 'CD4_TCM',
 'CD4_TEM',
 'CD8_Naive',
 'CD8_Proliferating',
 'CD8_TCM',
 'CD8_TEM',
 'Doublet',
 'Eryth',
 'HSPC',
 'ILC',
 'MAIT',
 'NK',
 'NK_CD56bright',
 'NK_Proliferating',
 'Plasmablast',
 'Platelet',
 'Treg',
 'cDC1',
 'cDC2',
 'dnT',
 'gdT',
 'pDC']

## Read PreLect features

In [4]:
os.chdir('/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_batch/Level2_log_inv/feature_selection')

# Load one feature table per cell type from '<celltype>_features.txt'
# (tab-separated, no header row: Gene, Weight, Tendency).
# Only a missing or empty file is treated as "no features for this cell type";
# any other failure (malformed file, non-numeric weights, interrupt) is allowed
# to surface instead of being silently reported as a skip.
features_dict = {}
for celltype in types:
    try:
        feature_df = pd.read_csv(
            f'{celltype}_features.txt',
            names=['Gene', 'Weight', 'Tendency'],
            sep='\t',
        )
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print('skipping:', celltype)
        continue

    # Order rows by Tendency (descending), then by absolute weight (descending).
    # The helper column exists only for sorting and is dropped again.
    feature_df = (
        feature_df
        .assign(abs_weight=feature_df['Weight'].abs())
        .sort_values(by=['Tendency', 'abs_weight'], ascending=[False, False])
        .drop(columns=['abs_weight'])
    )
    features_dict[celltype] = feature_df

# Summary table: total number of features and number of features with
# Tendency == 1 for every cell type that was loaded. Built in one shot so the
# columns get an integer dtype rather than object.
count_df = pd.DataFrame.from_dict(
    {
        celltype: [table.shape[0], int((table['Tendency'] == 1).sum())]
        for celltype, table in features_dict.items()
    },
    orient='index',
    columns=['Feature_count', 'Positive_feature_count'],
)
count_df

Unnamed: 0,Feature_count,Positive_feature_count
ASDC,27,16
B_intermediate,16,7
B_memory,35,16
B_naive,8,6
CD14_Mono,17,9
CD16_Mono,15,5
CD4_CTL,90,31
CD4_Naive,151,91
CD4_Proliferating,26,19
CD4_TCM,82,48


In [5]:
# Spot check: display the feature table stored under the 'ASDC' key.
features_dict['ASDC']

Unnamed: 0,Gene,Weight,Tendency
19,PPP1R14A,1.672309,1
1,S100A10,1.09801,1
4,PLAC8,0.556223,1
17,CST3,0.373947,1
5,SOX4,0.371989,1
16,TCF4,0.303589,1
20,AXL,0.152106,1
11,C12orf75,0.133231,1
15,IRF8,0.12114,1
18,SAMHD1,0.104704,1


In [6]:
# Build the query text for ACT: one line per cell type of the form
# 'Clus_<celltype>: GENE1, GENE2, ...', with genes in the order they appear
# in that cell type's feature table.
query_lines = []
for celltype in types:
    if celltype not in features_dict:
        # No feature table was loaded for this cell type, so there is nothing
        # to query; skip it instead of failing with a KeyError.
        print('skipping (no features loaded):', celltype)
        continue
    features = ', '.join(features_dict[celltype]['Gene'].tolist())
    query_lines.append(f'Clus_{celltype}: {features}\n')

# Join once at the end rather than growing the string inside the loop.
text = ''.join(query_lines)

In [7]:
# Show the assembled query text (one 'Clus_<celltype>: ...' line per cell type).
print(text)

Clus_ASDC: PPP1R14A, S100A10, PLAC8, CST3, SOX4, TCF4, AXL, C12orf75, IRF8, SAMHD1, TXN, APP, FCGRT, ANXA2, PLP2, CCDC50, MALAT1, TMSB4X, FTL, S100A9, CD52, B2M, MT-ND2, SAT1, MT-CO2, MT-CO3, RPLP1
Clus_B_intermediate: MS4A1, RALGPS2, BANK1, IGHM, GPR183, CD79A, JUND, TMSB10, TXNIP, TMSB4X, CXCR4, MALAT1, ACTB, TPT1, MT-CO3, MT-CO2
Clus_B_memory: HLA-DRA, LTB, LINC01781, BANK1, AIM2, MS4A1, BLK, IGHA1, ITGB1, TNFRSF13C, CD74, ARID5B, TXNIP, MARCKS, IGHG1, IGKC, IGHM, PTPRC, TMSB4X, FTH1, MALAT1, ITM2B, B2M, MT-CO1, SARAF, BTG1, SERF2, LYZ, CTSS, MYL6, JUNB, H3F3B, TMSB10, DDX5, FOXP1
Clus_B_naive: TCL1A, IGHM, IGHD, CXCR4, MS4A1, HLA-DRA, B2M, MALAT1
Clus_CD14_Mono: LYZ, CD14, VCAN, S100A8, S100A9, FCN1, NEAT1, CTSS, NFKBIA, RPS19, MALAT1, RPS27, B2M, TMSB4X, MT-ATP6, RPS29, RPS18
Clus_CD16_Mono: FCGR3A, LST1, AIF1, MS4A7, CFD, MALAT1, B2M, RPLP1, S100A9, S100A8, RPL13, RPS27, TMSB4X, RPS18, MT-ATP6
Clus_CD4_CTL: GZMH, FGFBP2, NKG7, IL7R, CD4, S100A4, CD52, TRBC1, ITGB1, HOPX, IL32, GN

In [8]:
# Switch to the ACT annotation directory and save the query text there
# as 'L2_query.txt', overwriting any existing file of that name.
act_dir = '/Users/evanli/Documents/EvanPys/Progress/PBMC_Hao_batch/Level2_log_inv/ACT_annotation'
os.chdir(act_dir)
with open('L2_query.txt', mode='w') as query_file:
    query_file.write(text)