In [1]:
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_score, f1_score, accuracy_score

import os
import scanpy as sc

#%% Load data
data = sc.read_h5ad('/home/jovyan/work/Research_datasets/Hao_PBMC.h5ad')
print('Original data shape:', data.shape)

# CPM
sc.pp.normalize_total(data, target_sum=1e6)
# log1p
sc.pp.log1p(data)

data.obs['celltype.l2'] = data.obs['celltype.l2'].str.replace(' ', '_')
label = data.obs['celltype.l2'].tolist()
types = np.unique(label).tolist()
# types = [s.replace(' ', '_') for s in types]
print('All cell types:', types)


#%% Read feature dict
os.chdir('/home/jovyan/work/GitHub/EvanPys/Progress/PBMC_Hao/Level2_pvl0_ttsplit/lambda_decision_ignore_section/L2_feature_selection')

features_dict = {}
# Read features for each celltype
for celltype in types:
    try:
        print('==================')
        print('Reading features:', celltype)
        feature_df = pd.read_csv(f'{celltype}_features.txt', names=['Gene', 'Weight', 'Tendency'], sep='\t')
        features_dict[celltype] = feature_df
    except:
        print('skipping:', celltype)
        continue
    # print(celltype, 'Feature count:', feature_df.shape[0])
    # print(celltype, 'Positive feature count:', feature_df[feature_df['Tendency'] == 1].shape[0])
    # print('------------------')


Original data shape: (161764, 20568)
All cell types: ['ASDC', 'B_intermediate', 'B_memory', 'B_naive', 'CD14_Mono', 'CD16_Mono', 'CD4_CTL', 'CD4_Naive', 'CD4_Proliferating', 'CD4_TCM', 'CD4_TEM', 'CD8_Naive', 'CD8_Proliferating', 'CD8_TCM', 'CD8_TEM', 'Doublet', 'Eryth', 'HSPC', 'ILC', 'MAIT', 'NK', 'NK_CD56bright', 'NK_Proliferating', 'Plasmablast', 'Platelet', 'Treg', 'cDC1', 'cDC2', 'dnT', 'gdT', 'pDC']
Reading features: ASDC
Reading features: B_intermediate
Reading features: B_memory
Reading features: B_naive
Reading features: CD14_Mono
Reading features: CD16_Mono
Reading features: CD4_CTL
Reading features: CD4_Naive
Reading features: CD4_Proliferating
Reading features: CD4_TCM
Reading features: CD4_TEM
Reading features: CD8_Naive
Reading features: CD8_Proliferating
Reading features: CD8_TCM
Reading features: CD8_TEM
Reading features: Doublet
Reading features: Eryth
Reading features: HSPC
Reading features: ILC
Reading features: MAIT
Reading features: NK
Reading features: NK_CD56bri

In [2]:
features_dict['CD8_TCM']

Unnamed: 0,Gene,Weight,Tendency
0,ENSG00000145425,-0.000267,0
1,ENSG00000075624,-0.06494,0
2,ENSG00000205542,-0.073277,0
3,ENSG00000167996,-0.151998,0
4,ENSG00000251562,-1.111446,0
5,ENSG00000166710,-0.07783,0
6,ENSG00000087086,-0.16889,0
7,ENSG00000198804,-0.240188,0
8,ENSG00000198712,-0.133361,0
9,ENSG00000198899,-0.074783,0


In [5]:
positive = features_dict['CD8_TCM'][features_dict['CD8_TCM']['Tendency'] == 1]['Gene'].tolist()
X = data[:, positive].X

In [6]:
X

<161764x0 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Row format>

In [9]:
X.shape[1]

0

In [10]:
len(X)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]