In [1]:
import scanpy as sc
from sklearn.model_selection import train_test_split
import os

def train_test_split_adata(adata, test_size=0.2, random_state=None):
    """Randomly split the observations of ``adata`` into a train and a test subset.

    Args:
        adata: object exposing ``obs.index`` and supporting indexing by a
            collection of obs labels (an AnnData object in this notebook).
        test_size: forwarded unchanged to ``train_test_split``.
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to get
            the same split on every run.

    Returns:
        Tuple ``(train_data, test_data)``: ``adata`` indexed by the train
        labels and by the test labels respectively.
    """
    cell_indices = adata.obs.index
    # Only the first occurrence of a repeated obs label takes part in the split.
    cell_indices = cell_indices[~cell_indices.duplicated(keep='first')]
    train_indices, test_indices = train_test_split(
        cell_indices, test_size=test_size, random_state=random_state
    )
    # Report: unique labels, train size, test size.
    print(len(cell_indices), len(train_indices), len(test_indices))
    train_data = adata[train_indices]
    test_data = adata[test_indices]
    return train_data, test_data


In [20]:
# Zheng68K: `path` is the directory holding the input .h5ad file; `output` is
# the root directory under which the train/test splits are written later on.
path = '/home/share/huadjyin/home/s_qiuping1/workspace/omics_model/data/finetune'
output = '/home/share/huadjyin/home/s_qiuping1/workspace/omics_model/bio_model/biollm/case/data/annotation/human/organs'
adata = sc.read_h5ad(path + '/Zheng68K.h5ad')
# Bare expression: the notebook displays the AnnData summary.
adata

AnnData object with n_obs × n_vars = 68450 × 16906
    obs: 'TSNE.1', 'TSNE.2', 'celltype', 'n_genes'
    uns: 'log1p'

In [6]:
# Store the var index (var_names) as an explicit 'gene_name' column in adata.var.
adata.var['gene_name'] = adata.var_names.values

In [7]:
# Largest value in the expression matrix.
# NOTE(review): with 'log1p' present in uns and a max around 10, X is presumably already log-transformed — confirm.
adata.X.max()

10.071463

In [21]:
# Number of cells per label in obs['celltype'].
adata.obs['celltype'].value_counts()

CD8+ Cytotoxic T                20757
CD8+/CD45RA+ Naive Cytotoxic    16645
CD56+ NK                         8775
CD4+/CD25 T Reg                  6185
CD19+ B                          5877
CD4+/CD45RO+ Memory              3059
CD14+ Monocyte                   2847
Dendritic                        2095
CD4+/CD45RA+/CD25- Naive T       1871
CD34+                             242
CD4+ T Helper2                     97
Name: celltype, dtype: int64

In [9]:
# Random split holding out 20% of the cells; the helper prints total / train / test counts.
# NOTE(review): no seed is passed here, so the split differs between runs.
train_adata, test_adata = train_test_split_adata(adata, test_size=0.2)

68450 54760 13690


In [12]:
# Write the Zheng68K split to <output>/zheng68k/{train,test}.h5ad.
# makedirs(exist_ok=True) already tolerates an existing directory, so the
# separate exists() check (and its check-then-create race) is not needed.
split_dir = os.path.join(output, 'zheng68k')
os.makedirs(split_dir, exist_ok=True)
train_adata.write_h5ad(os.path.join(split_dir, 'train.h5ad'))
test_adata.write_h5ad(os.path.join(split_dir, 'test.h5ad'))

In [2]:
# hPancreas: here `path` is the full path of the input .h5ad file; `output` is
# the root directory under which the train/test splits are written later on.
path = '/home/share/huadjyin/home/s_huluni/project/bio_model_240614/Data/INT/dataset4/hPancreas.h5ad'
output = '/home/share/huadjyin/home/s_qiuping1/workspace/omics_model/bio_model/biollm/case/data/annotation/human/organs'
adata = sc.read_h5ad(path)
# Bare expression: the notebook displays the AnnData summary.
adata

AnnData object with n_obs × n_vars = 14767 × 15558
    obs: 'batch', 'batchlb', 'celltype_orig', 'cellname', 'celltype'
    var: 'mean', 'std'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [3]:
# Largest value in the expression matrix.
# NOTE(review): uns contains 'log1p' and var has 'mean'/'std'; which normalization X carries is not visible here — confirm.
adata.X.max()

15.35066730099875

In [4]:
# Copy the obs['celltype'] labels, unchanged, into a 'cell_type_ontology_term_id' column.
# NOTE(review): presumably the column name downstream code reads labels from — confirm.
adata.obs['cell_type_ontology_term_id'] = adata.obs['celltype']

In [6]:
# Number of cells per value of obs['batch'].
adata.obs['batch'].value_counts()

1    8569
3    2127
2    2122
5    1492
4     457
Name: batch, dtype: int64

In [8]:
# (n_obs, n_vars) of the loaded object.
adata.shape

(14767, 15558)

In [9]:
# Batch composition of the subset with batch in {1, 2, 3} — the same filter used for the training split below.
adata[adata.obs['batch'].isin([1,2,3]), :].obs['batch'].value_counts()

1    8569
3    2127
2    2122
Name: batch, dtype: int64

In [10]:
# Split by batch instead of at random: batches 1-3 -> train, batches 4-5 -> test.
train_mask = adata.obs['batch'].isin([1, 2, 3])
test_mask = adata.obs['batch'].isin([4, 5])
train_adata = adata[train_mask, :].copy()
test_adata = adata[test_mask, :].copy()
# A batch label outside the two lists would be dropped silently; fail loudly instead.
assert train_adata.n_obs + test_adata.n_obs == adata.n_obs, \
    'some cells belong to neither the train nor the test batches'

# makedirs(exist_ok=True) already tolerates an existing directory, so no
# separate exists() check is needed.
split_dir = os.path.join(output, 'hpancreas_intra')
os.makedirs(split_dir, exist_ok=True)
train_adata.write_h5ad(os.path.join(split_dir, 'train.h5ad'))
test_adata.write_h5ad(os.path.join(split_dir, 'test.h5ad'))

In [13]:
# Cells per label in the training split.
# NOTE(review): the AnnData summary printed after loading lists 'celltype' (lower-case) in obs, not 'Celltype';
# the recorded output implies a kernel state where 'Celltype' existed — confirm the column name on a fresh run.
train_adata.obs['Celltype'].value_counts()

alpha           4024
beta            3243
ductal          1708
acinar          1362
delta            908
gamma            553
stellate         511
endothelial      289
mesenchymal       80
macrophage        55
mast              32
epsilon           28
schwann           13
t_cell             7
MHC class II       5
Name: Celltype, dtype: int64

In [14]:
# Cells per label in the test split.
# NOTE(review): the AnnData summary printed after loading lists 'celltype' (lower-case) in obs, not 'Celltype' — confirm the column name on a fresh run.
test_adata.obs['Celltype'].value_counts()

alpha          1076
beta            583
gamma           103
ductal           96
delta            58
mesenchymal      27
acinar            6
Name: Celltype, dtype: int64

In [15]:
# hPBMC
# hPBMC: `path` is the full path of the input .h5ad file; `output` is the root
# directory under which the train/test splits are written later on.
path = '/home/share/huadjyin/home/s_huluni/project/bio_model_240614/Data/INT/dataset5/hPBMC.h5ad'
output = '/home/share/huadjyin/home/s_qiuping1/workspace/omics_model/bio_model/biollm/case/data/annotation/human/organs'
adata = sc.read_h5ad(path)
# Bare expression: the notebook displays the AnnData summary.
adata

AnnData object with n_obs × n_vars = 15476 × 33694
    obs: 'Sample', 'n_counts', 'n_genes', 'batch', 'louvain', 'anno', 'Method', 'CellType'
    var: 'mean', 'std'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [16]:
# Number of cells per value of obs['batch'].
adata.obs['batch'].value_counts()

0    8098
1    7378
Name: batch, dtype: int64

In [20]:
# Copy the obs['CellType'] labels, unchanged, into a 'cell_type_ontology_term_id' column.
# NOTE(review): presumably the column name downstream code reads labels from — confirm.
adata.obs['cell_type_ontology_term_id'] = adata.obs['CellType']

In [21]:
# Split by batch instead of at random: batch 0 -> train, batch 1 -> test.
train_mask = adata.obs['batch'].isin([0])
test_mask = adata.obs['batch'].isin([1])
train_adata = adata[train_mask, :].copy()
test_adata = adata[test_mask, :].copy()
# A batch label outside the two lists would be dropped silently; fail loudly instead.
assert train_adata.n_obs + test_adata.n_obs == adata.n_obs, \
    'some cells belong to neither the train nor the test batches'

# makedirs(exist_ok=True) already tolerates an existing directory, so no
# separate exists() check is needed.
split_dir = os.path.join(output, 'hPBMC_intra')
os.makedirs(split_dir, exist_ok=True)
train_adata.write_h5ad(os.path.join(split_dir, 'train.h5ad'))
test_adata.write_h5ad(os.path.join(split_dir, 'test.h5ad'))

In [22]:
# Cells per obs['CellType'] label in the training split.
train_adata.obs['CellType'].value_counts()

CD4 T cell                     2267
CD8 T cell                     2076
Monocyte_CD14                  1914
B cell                         1199
NK cell                         303
Monocyte_FCGR3A                 206
Plasmacytoid dendritic cell      67
Megakaryocyte                    49
Hematopoietic stem cell          17
Name: CellType, dtype: int64

In [23]:
# Cells per obs['CellType'] label in the test split.
test_adata.obs['CellType'].value_counts()

CD4 T cell                     2183
Monocyte_CD14                  2176
B cell                         1172
CD8 T cell                     1066
Monocyte_FCGR3A                 355
NK cell                         290
Plasmacytoid dendritic cell      72
Megakaryocyte                    57
Hematopoietic stem cell           7
Name: CellType, dtype: int64

In [13]:
# Load the previously written eye test split and show its gene index.
# Note: this rebinds the global `adata`.
adata = sc.read_h5ad('/home/share/huadjyin/home/s_qiuping1/workspace/omics_model/bio_model/biollm/case/data/annotation/human/organs/eye/test.h5ad')
adata.var_names



Index(['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A',
       'OR4G4P', 'OR4G11P', 'OR4F5', 'RP11-34P13.7',
       ...
       'RNU6-1314P', 'CYCSP48', 'ANKRD36P1', 'RP11-557B9.1', 'TPTE2P4',
       'CYCSP49', 'SLC25A15P1', 'PARP4P1', 'CCNQP2', 'CTBP2P1'],
      dtype='object', length=56445)

In [14]:
# Load the previously written lung test split and show its gene index.
# Note: this rebinds the global `adata`.
adata = sc.read_h5ad('/home/share/huadjyin/home/s_qiuping1/workspace/omics_model/bio_model/biollm/case/data/annotation/human/organs/lung/test.h5ad')
adata.var_names

Index(['MIR1302-2HG', 'FAM138A', 'AL627309.1', 'AL627309.3', 'AL669831.5',
       'FAM87B', 'LINC00115', 'FAM41C', 'AL645608.7', 'AL645608.3',
       ...
       'AC011043.2', 'AL592183.1', 'AC007325.1', 'AC007325.4', 'AC007325.2',
       'AL354822.1', 'AC023491.2', 'AC004556.1', 'AC240274.1', 'AC213203.1'],
      dtype='object', length=23747)

In [10]:
from scipy.sparse import issparse
import numpy as np

# Upper bound on the number of positive entries kept per cell (row); a falsy
# value disables the masking step entirely.
max_none_zore=5000
# adata = train_adata
idx = adata.obs_names.tolist()
col = adata.var_names.tolist()
if issparse(adata.X):
    # toarray() materialises a fresh dense array, independent of adata.X.
    gexpr_feature = adata.X.toarray()
else:
    # Copy the dense matrix: the zeroing below is done in place and must not
    # write through to adata.X (the sparse branch already works on a copy).
    gexpr_feature = np.array(adata.X)
if max_none_zore:
    none_zero = gexpr_feature > 0
    none_zero_num = none_zero.sum(1)
    # Rows holding more positive entries than the cap.
    index = np.argwhere(none_zero_num > max_none_zore).reshape(-1)
    for i in index:
        none_zero_index = np.argwhere(none_zero[i]).reshape(-1)
        # Randomly choose which surplus positive entries of this row get zeroed.
        np.random.shuffle(none_zero_index)
        mask_num = none_zero_num[i] - max_none_zore
        mask_index = none_zero_index[0: mask_num]
        gexpr_feature[i, mask_index] = 0