In [1]:
import numpy as np
import scanpy as sc




# Create a sample data for Norman perturbation dataset from GEARS

This notebook creates a sample dataset with all the perturbations from the Norman dataset from GEARS, and only a few cells per perturbation to create a small example of the dataset

In [2]:
adata = sc.read_h5ad('/dccstor/bmfm-targets/data/omics/transcriptome/scRNA/finetune/Perturbation/GEARS/norman_scgpt_split.h5ad')

In [3]:
adata.obs['condition'].nunique()

277

In [3]:
ctrl_samples = adata[adata.obs['condition'] == 'ctrl']
print(len(ctrl_samples))
adata_to_downsample = adata[adata.obs['condition'] != 'ctrl']

7353


In [4]:
sampled_indices_ctrl = np.random.choice(ctrl_samples.obs_names, size=500, replace=False)
ctrl_samples =  ctrl_samples[sampled_indices_ctrl].copy()

sampled_indices = (
    adata_to_downsample.obs.groupby('condition')
    .apply(lambda x: x.sample(n=min(5, len(x)), random_state=42))  # sample up to 5 or less if fewer available
    .index.get_level_values(1)  # Get the index of the rows after grouping
)

# Subset the AnnData object with the sampled indices
adata_downsampled = adata[sampled_indices].copy()
adata_downsampled = ctrl_samples.concatenate(adata_downsampled)

  adata_to_downsample.obs.groupby('condition')
  .apply(lambda x: x.sample(n=min(5, len(x)), random_state=42))  # sample up to 5 or less if fewer available
  adata_downsampled = ctrl_samples.concatenate(adata_downsampled)


In [5]:
adata_downsampled.obs['condition'].nunique()

277

In [10]:
dev_cells = adata_downsampled[adata_downsampled.obs['scgpt_split'] == 'dev']
dev_cells.obs['condition'].nunique()

31

In [7]:
print(len(adata_downsampled))
print(len(adata))

1880
89357


In [8]:
adata_downsampled.write_h5ad('/dccstor/bmfm-targets/data/omics/transcriptome/scRNA/finetune/Perturbation/GEARS/norman_GEARS_downsampled.h5ad')

In [7]:
adata_downsampled.obs['condition'].unique()

['ctrl', 'AHR+FEV', 'AHR+KLF1', 'AHR+ctrl', 'ARID1A+ctrl', ..., 'ZC3HAV1+CEBPE', 'ZC3HAV1+HOXC13', 'ZC3HAV1+ctrl', 'ZNF318+FOXL2', 'ZNF318+ctrl']
Length: 277
Categories (277, object): ['AHR+FEV', 'AHR+KLF1', 'AHR+ctrl', 'ARID1A+ctrl', ..., 'ctrl+UBASH3A', 'ctrl+UBASH3B', 'ctrl+ZBTB1', 'ctrl+ZBTB25']

In [8]:
sample_test_data = adata_downsampled[adata_downsampled.obs['condition'].isin(['ctrl','AHR+FEV','AHR+KLF1', 'AHR+ctrl', 'ZC3HAV1+CEBPE', 'ctrl+UBASH3A'])]

In [9]:
sample_test_data.obs['condition'].nunique()

6

In [10]:
sample_test_data.write_h5ad('/dccstor/bmfm-targets/users/liransz/code/bmfm-targets/bmfm_targets/tests/resources/finetune/norman/norman.h5ad')