In [1]:
from perturbench.dataset import PerturbationDataset
from perturbench.scenarios import RandomSplitScenario
from perturbench.models import RandomModel
from perturbench.metrics import AverageDifferenceMetric
from perturbench.benchmark import PerturbationBenchmark

import numpy as np
import scipy as sp
import pandas as pd
import anndata as ad
import torch

# End-to-end pipeline for benchmarking perturbation predictions using perturbench

## 1. Prepare dataset

We will prepare an example dataset with 3 genetic perturbations and a control condition: 4,000 cells in total (1,000 per condition), with sex and cell type as covariates. We could also use a pre-prepared dataset.

In [2]:
# Build a synthetic example dataset: random integer counts for four
# equally sized conditions (three genetic perturbations plus control).
n_cells = 4_000
n_genes = 20_000

# Seeded generator so that "Restart Kernel -> Run All" reproduces the
# same counts and covariate assignments every time.
rng = np.random.default_rng(0)

# Counts are drawn uniformly from [0, 100); `high` is exclusive.
raw_counts = rng.integers(low=0, high=100, size=(n_cells, n_genes))
raw_counts_sparse = sp.sparse.csr_matrix(raw_counts)

# Per-condition labels are sized from n_cells so the obs table cannot
# drift out of step with the count matrix when n_cells is changed.
# None is the label used for the control cells (presumably how
# PerturbationDataset recognises controls -- confirm against its docs).
conditions = ["GENETIC:MYC", "GENETIC:AKT", "GENETIC:PD1", None]
cells_per_condition, remainder = divmod(n_cells, len(conditions))
if remainder:
    raise ValueError(
        f"n_cells ({n_cells}) must be divisible by the number of conditions ({len(conditions)})"
    )
perturbations = [
    condition for condition in conditions for _ in range(cells_per_condition)
]

# Covariates are assigned independently and uniformly at random per cell.
sex = rng.choice(["male", "female"], size=n_cells)
cell_type = rng.choice(["cell_type1", "cell_type2"], size=n_cells)

# One obs row per cell; column names are referenced by later cells.
obs = pd.DataFrame(
    {"perturbation": perturbations, "sex": sex, "cell_type": cell_type}
)
anndata = ad.AnnData(raw_counts_sparse, obs=obs)



In [3]:
# Wrap the AnnData in a PerturbationDataset. The second and third
# arguments are passed positionally; presumably they name the obs column
# holding the perturbation label and the obs columns used as covariates
# (confirm against the PerturbationDataset signature).
# The cell count in the description is read from the data itself so the
# text cannot disagree with the dataset actually built.
perturbation_dataset = PerturbationDataset(
    anndata,
    "perturbation",
    ["sex", "cell_type"],
    name="Example perturbation dataset",
    description=(
        "Example dataset with 3 genetic perturbations, and a control condition. "
        f"{anndata.n_obs:,} cells total, with sex and cell type as covariates."
    ),
)

In [4]:
# Show the AnnData held by the dataset wrapper; as the last expression in
# the cell, its repr is rendered as the cell output.
perturbation_dataset.anndata()

AnnData object with n_obs × n_vars = 4000 × 20000
    obs: 'sex', 'cell_type', 'perturbation'

## 2. Declare a scenario

How should we define train-test splits? The first step is to create a scenario.

In [5]:
# Scenario object that determines the train-test split; it is constructed
# with no arguments, so whatever defaults RandomSplitScenario defines apply.
my_favourite_scenario = RandomSplitScenario()

## 3. Choose a model to benchmark

Now let's pick the model we would like to benchmark! We'll use a simple random example (shouldn't perform well!)

In [6]:
# Prefer the GPU when PyTorch reports one is usable; otherwise stay on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The model to be benchmarked is handed the selected device.
random_model = RandomModel(device=device)

## 4. Choose our metrics

Now we need to select which metrics we would like to use! Let's do a simple average difference between ground truth and prediction.

In [7]:
# Metric used to score predictions against ground truth; it is constructed
# with no arguments, so AverageDifferenceMetric's defaults apply.
my_favourite_metric = AverageDifferenceMetric()

## 5. Finally, register this all with a benchmark object

Now, let's register this all and check what's going to be run!

In [8]:
benchmark = PerturbationBenchmark()

# A single dataset and a single scenario are registered directly.
benchmark.register_dataset(perturbation_dataset)
benchmark.register_scenario(my_favourite_scenario)

# Models and metrics are registered in loops so that further entries can
# be added to either tuple without touching the registration calls.
for model in (random_model,):
    benchmark.register_model(model)

for metric in (my_favourite_metric,):
    benchmark.register_metric(metric)


Check what's been registered:

In [9]:
# Inspect what has been registered on the benchmark before running it;
# the return value is displayed as the cell output.
benchmark.registered()

OK, we're happy with that plan! Now let's run the benchmarking :)

In [None]:
# Execute the benchmark over everything registered above.
# NOTE(review): presumably this is where model training and evaluation
# happen, so it may be the slow cell -- confirm against perturbench docs.
benchmark.run()

And finally let's view the results as a table!

In [None]:
# Display the benchmark results; intended to be run after benchmark.run().
benchmark.summary()