In [2]:
import os
from pathlib import Path

import numpy as np
import scanpy as sc

  from pkg_resources import get_distribution, DistributionNotFound


### Load internal split

In [4]:
data_folder = Path(os.environ["BMFM_TARGETS_VCC_DATA"]) / "internal_split"
data_folder

PosixPath('/dccstor/bmfm-targets/data/omics/transcriptome/scRNA/finetune/Perturbation/vcc/internal_split')

In [4]:
train_data = sc.read_h5ad(data_folder / "train_data.h5ad")
test_data = sc.read_h5ad(data_folder / "test_data.h5ad")

In [5]:
train_data

AnnData object with n_obs × n_vars = 120360 × 18080
    obs: 'target_gene', 'guide_id', 'batch'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mt', 'ribo', 'n_cells'

In [6]:
train_data.obs["target_gene"].nunique()

100

In [7]:
train_data.obs["target_gene"].value_counts()

target_gene
TMSB4X     4760
TADA1      4035
IGF2R      3109
NCK2       2929
MED13      2787
           ... 
BRD9         67
DNAJA3       65
OXA1L        63
RNF20        47
ATP6V0C      33
Name: count, Length: 100, dtype: int64

In [8]:
train_data.X.toarray()

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 4., 2.],
       ...,
       [0., 4., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 2., 1.]],
      shape=(120360, 18080), dtype=float32)

In [11]:
test_data

AnnData object with n_obs × n_vars = 100913 × 18080
    obs: 'target_gene', 'guide_id', 'batch'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mt', 'ribo', 'n_cells'

In [9]:
test_data.obs["target_gene"].nunique()

51

In [10]:
test_data.obs["target_gene"].value_counts()

target_gene
non-targeting    38176
PRCP              4331
HIRA              3407
MED12             2766
STAT1             2493
USP22             2491
ATP1B1            2121
KAT2A             2120
NISCH             2101
WFS1              2043
MAP3K7            2019
HMGN1             2004
RNF2              1844
SMARCA5           1795
TRAM2             1756
EIF4B             1585
STAG2             1573
DHCR24            1420
NDUFB4            1413
DHX36             1392
DAXX              1332
NIT1              1287
STX4              1249
PMS1              1100
ZNF286A           1059
MAST2             1029
MAU2              1021
IRF3               985
MKI67              968
DLG5               864
CLDN6              864
SLC25A3            805
METTL17            790
SMAGP              788
TAZ                749
C1QBP              711
CALM3              681
RRM1               638
XRCC4              591
TRAPPC6A           589
SIN3B              586
SV2A               580
METTL3             498

Create `internal_split` column by concatenating the two datasets, named subset `train`, `dev`

In [14]:
train = train_data.copy()
test  = test_data.copy()

train.obs["internal_split"] = "train"
test.obs["internal_split"] = np.where(
    test.obs["target_gene"].astype(str).eq("non-targeting"),
    None,    # missing / no split for non-targeting
    "dev"
)

In [20]:
test.obs["internal_split"].value_counts(dropna=False)

internal_split
dev     62737
None    38176
Name: count, dtype: int64

In [25]:

adata = sc.concat(
    [train, test],
    join="inner",             # same structure
    index_unique=None
)

print(adata.obs["internal_split"].value_counts(dropna=False))

internal_split
train    120360
dev       62737
None      38176
Name: count, dtype: int64


Create another split column, `internal_split_for_test` with same values but with names `train, test` to be used in test tasks

In [26]:
adata.obs["internal_split_for_test"] = adata.obs["internal_split"].copy()
adata.obs["internal_split_for_test"] = adata.obs["internal_split_for_test"].replace("dev", "test")
adata.obs["internal_split_for_test"].value_counts(dropna=False)


internal_split_for_test
train    120360
test      62737
None      38176
Name: count, dtype: int64

In [29]:
adata.shape

(221273, 18080)

In [31]:
adata.obs

Unnamed: 0,target_gene,guide_id,batch,internal_split,internal_split_for_test
AAACAAGCAACCTTGTACTTTAGG-Flex_1_01,CHMP3,CHMP3_P1P2_A|CHMP3_P1P2_B,Flex_1_01,train,train
AAACAAGCATTGCCGCACTTTAGG-Flex_1_01,AKT2,AKT2_P1P2_A|AKT2_P1P2_B,Flex_1_01,train,train
AAACCAATCCCTCGCTACTTTAGG-Flex_1_01,TMSB4X,TMSB4X_P1_A|TMSB4X_P1_B,Flex_1_01,train,train
AAACCAATCTAAATCCACTTTAGG-Flex_1_01,KLF10,KLF10_P2_A|KLF10_P2_B,Flex_1_01,train,train
AAACGGGCACCTAAGAACTTTAGG-Flex_1_01,TARBP2,TARBP2_P1P2_A|TARBP2_P1P2_B,Flex_1_01,train,train
...,...,...,...,...,...
TTTGCTCTCCAATTACATTCGGTT-Flex_3_16,TRAM2,TRAM2_P1P2_A|TRAM2_P1P2_B,Flex_3_16,dev,test
TTTGCTGAGTAACTTCATTCGGTT-Flex_3_16,non-targeting,non-targeting_00057|non-targeting_00277,Flex_3_16,,
TTTGGACGTGGTGCAGATTCGGTT-Flex_3_16,non-targeting,non-targeting_00035|non-targeting_03439,Flex_3_16,,
TTTGTGAGTAGTAGCAATTCGGTT-Flex_3_16,KDM1A,KDM1A_P1P2_A|KDM1A_P1P2_B,Flex_3_16,dev,test


In [None]:
#sc.write(data_folder / "train_and_test_09162025.h5ad", adata)

In [29]:
final_adata = sc.read_h5ad(data_folder / "train_and_test_09162025.h5ad")

In [30]:
final_adata.obs

Unnamed: 0,target_gene,guide_id,batch,internal_split,internal_split_for_test
AAACAAGCAACCTTGTACTTTAGG-Flex_1_01,CHMP3,CHMP3_P1P2_A|CHMP3_P1P2_B,Flex_1_01,train,train
AAACAAGCATTGCCGCACTTTAGG-Flex_1_01,AKT2,AKT2_P1P2_A|AKT2_P1P2_B,Flex_1_01,train,train
AAACCAATCCCTCGCTACTTTAGG-Flex_1_01,TMSB4X,TMSB4X_P1_A|TMSB4X_P1_B,Flex_1_01,train,train
AAACCAATCTAAATCCACTTTAGG-Flex_1_01,KLF10,KLF10_P2_A|KLF10_P2_B,Flex_1_01,train,train
AAACGGGCACCTAAGAACTTTAGG-Flex_1_01,TARBP2,TARBP2_P1P2_A|TARBP2_P1P2_B,Flex_1_01,train,train
...,...,...,...,...,...
TTTGCTCTCCAATTACATTCGGTT-Flex_3_16,TRAM2,TRAM2_P1P2_A|TRAM2_P1P2_B,Flex_3_16,dev,test
TTTGCTGAGTAACTTCATTCGGTT-Flex_3_16,non-targeting,non-targeting_00057|non-targeting_00277,Flex_3_16,,
TTTGGACGTGGTGCAGATTCGGTT-Flex_3_16,non-targeting,non-targeting_00035|non-targeting_03439,Flex_3_16,,
TTTGTGAGTAGTAGCAATTCGGTT-Flex_3_16,KDM1A,KDM1A_P1P2_A|KDM1A_P1P2_B,Flex_3_16,dev,test


### View processed file (after running transformations) 

In [31]:
final_processed_adata = sc.read_h5ad(data_folder / "train_and_test_09162025_processed.h5ad")

In [32]:
final_processed_adata.shape

(221273, 18080)

In [33]:
type(final_processed_adata.X) 

scipy.sparse._csr.csr_matrix

In [34]:
final_processed_adata.X.toarray()

array([[0.        , 0.        , 0.69651216, ..., 0.        , 0.        ,
        0.69651216],
       [0.        , 1.1825377 , 0.        , ..., 0.        , 0.        ,
        1.1825377 ],
       [0.        , 0.8600038 , 0.        , ..., 0.        , 1.8644954 ,
        1.3154263 ],
       ...,
       [0.        , 1.4357406 , 0.        , ..., 0.        , 0.7263815 ,
        0.42774358],
       [0.        , 1.3759232 , 0.        , ..., 0.686245  , 1.0893987 ,
        1.3759232 ],
       [0.        , 1.25836   , 0.        , ..., 0.        , 0.6097012 ,
        0.        ]], shape=(221273, 18080), dtype=float32)

In [35]:
final_processed_adata.obs

Unnamed: 0,target_gene,guide_id,batch,internal_split,internal_split_for_test
AAACAAGCAACCTTGTACTTTAGG-Flex_1_01,CHMP3,CHMP3_P1P2_A|CHMP3_P1P2_B,Flex_1_01,train,train
AAACAAGCATTGCCGCACTTTAGG-Flex_1_01,AKT2,AKT2_P1P2_A|AKT2_P1P2_B,Flex_1_01,train,train
AAACCAATCCCTCGCTACTTTAGG-Flex_1_01,TMSB4X,TMSB4X_P1_A|TMSB4X_P1_B,Flex_1_01,train,train
AAACCAATCTAAATCCACTTTAGG-Flex_1_01,KLF10,KLF10_P2_A|KLF10_P2_B,Flex_1_01,train,train
AAACGGGCACCTAAGAACTTTAGG-Flex_1_01,TARBP2,TARBP2_P1P2_A|TARBP2_P1P2_B,Flex_1_01,train,train
...,...,...,...,...,...
TTTGCTCTCCAATTACATTCGGTT-Flex_3_16,TRAM2,TRAM2_P1P2_A|TRAM2_P1P2_B,Flex_3_16,dev,test
TTTGCTGAGTAACTTCATTCGGTT-Flex_3_16,Control,non-targeting_00057|non-targeting_00277,Flex_3_16,,
TTTGGACGTGGTGCAGATTCGGTT-Flex_3_16,Control,non-targeting_00035|non-targeting_03439,Flex_3_16,,
TTTGTGAGTAGTAGCAATTCGGTT-Flex_3_16,KDM1A,KDM1A_P1P2_A|KDM1A_P1P2_B,Flex_3_16,dev,test


In [36]:
final_processed_adata.obs["internal_split"].value_counts(dropna=False)

internal_split
train    120360
dev       62737
NaN       38176
Name: count, dtype: int64

In [37]:
final_processed_adata.obs["internal_split_for_test"].value_counts(dropna=False)

internal_split_for_test
train    120360
test      62737
NaN       38176
Name: count, dtype: int64

In [38]:
final_processed_adata[final_processed_adata.obs["internal_split"]=="dev"].obs["target_gene"].nunique()

50

In [39]:
final_processed_adata[final_processed_adata.obs["internal_split"]=="dev"].obs["target_gene"].value_counts()

target_gene
PRCP        4331
HIRA        3407
MED12       2766
STAT1       2493
USP22       2491
ATP1B1      2121
KAT2A       2120
NISCH       2101
WFS1        2043
MAP3K7      2019
HMGN1       2004
RNF2        1844
SMARCA5     1795
TRAM2       1756
EIF4B       1585
STAG2       1573
DHCR24      1420
NDUFB4      1413
DHX36       1392
DAXX        1332
NIT1        1287
STX4        1249
PMS1        1100
ZNF286A     1059
MAST2       1029
MAU2        1021
IRF3         985
MKI67        968
DLG5         864
CLDN6        864
SLC25A3      805
METTL17      790
SMAGP        788
TAZ          749
C1QBP        711
CALM3        681
RRM1         638
XRCC4        591
TRAPPC6A     589
SIN3B        586
SV2A         580
METTL3       498
ATM          455
MED24        455
KDM1A        352
ETV4         263
SHPRH        241
EPHB4        241
FDPS         154
TAF13        138
Name: count, dtype: int64

In [40]:
final_processed_adata[final_processed_adata.obs["internal_split_for_test"]=="test"].obs["target_gene"].nunique()

50

In [41]:
final_processed_adata[final_processed_adata.obs["internal_split_for_test"]=="test"].obs["target_gene"].value_counts()

target_gene
PRCP        4331
HIRA        3407
MED12       2766
STAT1       2493
USP22       2491
ATP1B1      2121
KAT2A       2120
NISCH       2101
WFS1        2043
MAP3K7      2019
HMGN1       2004
RNF2        1844
SMARCA5     1795
TRAM2       1756
EIF4B       1585
STAG2       1573
DHCR24      1420
NDUFB4      1413
DHX36       1392
DAXX        1332
NIT1        1287
STX4        1249
PMS1        1100
ZNF286A     1059
MAST2       1029
MAU2        1021
IRF3         985
MKI67        968
DLG5         864
CLDN6        864
SLC25A3      805
METTL17      790
SMAGP        788
TAZ          749
C1QBP        711
CALM3        681
RRM1         638
XRCC4        591
TRAPPC6A     589
SIN3B        586
SV2A         580
METTL3       498
ATM          455
MED24        455
KDM1A        352
ETV4         263
SHPRH        241
EPHB4        241
FDPS         154
TAF13        138
Name: count, dtype: int64

save only test set into a spearate file - to be used with cell-eval

In [42]:
final_processed_adata_test_only = final_processed_adata[final_processed_adata.obs["internal_split_for_test"] != "train"]

In [43]:
final_processed_adata_test_only.obs["internal_split_for_test"].value_counts(dropna=False)

internal_split_for_test
test    62737
NaN     38176
Name: count, dtype: int64

In [44]:
final_processed_adata_test_only.obs["target_gene"].nunique()

51

In [45]:
final_processed_adata_test_only.obs["target_gene"].value_counts()

target_gene
Control     38176
PRCP         4331
HIRA         3407
MED12        2766
STAT1        2493
USP22        2491
ATP1B1       2121
KAT2A        2120
NISCH        2101
WFS1         2043
MAP3K7       2019
HMGN1        2004
RNF2         1844
SMARCA5      1795
TRAM2        1756
EIF4B        1585
STAG2        1573
DHCR24       1420
NDUFB4       1413
DHX36        1392
DAXX         1332
NIT1         1287
STX4         1249
PMS1         1100
ZNF286A      1059
MAST2        1029
MAU2         1021
IRF3          985
MKI67         968
DLG5          864
CLDN6         864
SLC25A3       805
METTL17       790
SMAGP         788
TAZ           749
C1QBP         711
CALM3         681
RRM1          638
XRCC4         591
TRAPPC6A      589
SIN3B         586
SV2A          580
METTL3        498
ATM           455
MED24         455
KDM1A         352
ETV4          263
SHPRH         241
EPHB4         241
FDPS          154
TAF13         138
Name: count, dtype: int64

In [46]:
final_processed_adata_test_only.obs["target_gene"] = (
    final_processed_adata_test_only.obs["target_gene"]
    .replace("Control", "non-targeting")
)

  .replace("Control", "non-targeting")
  final_processed_adata_test_only.obs["target_gene"] = (


In [47]:
final_processed_adata_test_only.obs["target_gene"].value_counts()

target_gene
non-targeting    38176
PRCP              4331
HIRA              3407
MED12             2766
STAT1             2493
USP22             2491
ATP1B1            2121
KAT2A             2120
NISCH             2101
WFS1              2043
MAP3K7            2019
HMGN1             2004
RNF2              1844
SMARCA5           1795
TRAM2             1756
EIF4B             1585
STAG2             1573
DHCR24            1420
NDUFB4            1413
DHX36             1392
DAXX              1332
NIT1              1287
STX4              1249
PMS1              1100
ZNF286A           1059
MAST2             1029
MAU2              1021
IRF3               985
MKI67              968
DLG5               864
CLDN6              864
SLC25A3            805
METTL17            790
SMAGP              788
TAZ                749
C1QBP              711
CALM3              681
RRM1               638
XRCC4              591
TRAPPC6A           589
SIN3B              586
SV2A               580
METTL3             498

In [48]:
test_processed_file_for_cell_eval = data_folder / "train_and_test_09162025_processed_test_only.h5ad"


In [49]:
final_processed_adata_test_only.write(test_processed_file_for_cell_eval)