In [2]:
import memento
import pandas as pd

import cellxgene_census

In [3]:
from itables import init_notebook_mode

init_notebook_mode(all_interactive=True)

ModuleNotFoundError: No module named 'itables'

In [4]:
census = cellxgene_census.open_soma(census_version="2023-12-15")

In [5]:
# guo, disease == "COVID-19"
dataset_guo = "ae5341b8-60fb-4fac-86db-86e49ee66287"
# aruna
dataset_aruna = "59b69042-47c2-47fd-ad03-d21beb99818f"
# wilk, slide-seq
dataset_wilk = "055ca631-6ffb-40de-815e-b931e10718c0"

cell_type_monocyte = "classical monocyte"
cell_type_t_cell = "central memory CD4-positive, alpha-beta T cell"
cell_type_nk = "natural killer cell"

## Perform 5-10 more concordance comparisons with a controlled environment 

Steps:
- Look for example cases
   - 1-2 known comparisons, single-dataset single-donor, no batch effects.
   - 2-4 known comparisons, simple batch-effects with multi-donors but no multi-assay nor multi-dataset.
   - 2-4 known comparisons, complex batch-effects with multi-datasets and multi-assay.
- For each example
   - Run vanilla memento
   - Run pre-comuputed memento
   - Perform comparison of p-values between the two versions as in plot below
- Validation passes if there is linear correlation, with spearman > 0.9
- Brownie points if there is a high overlap between between significant genes at p < 0.001, do hypergeometric test

### Example cases

#### 1-2 known comparisons, single-dataset single-donor, no batch effects.

**a) Single-dataset, single-donor, no batch effects.**

- Collection: A Web Portal and Workbench for Biological Dissection of Single Cell COVID-19 Host Responses
- Dataset: Individual Single-Cell RNA-seq PBMC Data from Arunachalam et al.
   - Assay: 10X
- Comparison:
   - classical monocytes vs T-cells in one donor


In [6]:
datasets = [dataset_aruna]
donor_id = "cov17"
disease = "normal"
cell_types = [cell_type_monocyte, cell_type_t_cell]

adata = cellxgene_census.get_anndata(
    census=census,
    organism="homo_sapiens",
    obsm_layers=["scvi"],
    obs_value_filter=f"dataset_id in {datasets} and disease == '{disease}' and cell_type in {cell_types} and donor_id == '{donor_id}'",
)

In [6]:
# adata.obs["donor_id"].value_counts()

In [7]:
# scanpy.pp.neighbors(adata, use_rep="scvi")
# scanpy.tl.umap(adata)
# scanpy.pl.umap(adata, color=["cell_type"])

Running memento vanilla

In [7]:
# Setup

# Assuming sequenced to 50%, dataset specific number
adata.var.index = adata.var["feature_name"].values
adata.obs["q"] = 0.15


# Classical monocyte encoded as 1
adata.obs["treatment"] = (adata.obs["cell_type"] == cell_types[0]).astype(int)

# Setup memento
memento.setup_memento(adata, q_column="q", trim_percent=0.1)  # trim_percent tunes cell size calculation
memento.create_groups(adata, label_columns=["treatment"])
memento.compute_1d_moments(adata, min_perc_group=0.9)
group_metadata = memento.get_groups(adata)

treatment_df = group_metadata[["treatment"]]

memento.ht_1d_moments(
    adata,
    # covariate=covariate_df,
    treatment=treatment_df,
    num_boot=5000,
    verbose=1,
    num_cpus=12,
    resample_rep=False,
    approx=False,
)

result = memento.get_1d_ht_result(adata)

  df[col] = pd.to_numeric(df[col], errors='ignore')
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    3.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.7s
[Parallel(n_jobs=12)]: Done 576 tasks      | elapsed:    8.7s
[Parallel(n_jobs=12)]: Done 1276 tasks      | elapsed:   17.8s
[Parallel(n_jobs=12)]: Done 2176 tasks      | elapsed:   29.1s
[Parallel(n_jobs=12)]: Done 3276 tasks      | elapsed:   41.6s
[Parallel(n_jobs=12)]: Done 4264 tasks      | elapsed:   56.4s
[Parallel(n_jobs=12)]: Done 5312 out of 5335 | elapsed:  1.1min remaining:    0.3s
[Parallel(n_jobs=12)]: Done 5335 out of 5335 | elapsed:  1.2min finished


In [9]:
# result

**b) Single-dataset, single-donor, no batch effects.**

- Collection: A Web Portal and Workbench for Biological Dissection of Single Cell COVID-19 Host Responses
- Dataset: Individual Single-Cell RNA-seq PBMC Data from Arunachalam et al.
   - Assay: 10X
- Comparison:
   - classical monocytes vs natural killer cells in one donor


In [8]:
datasets = [dataset_aruna]
donor_id = "cov17"
disease = "normal"
cell_types = [cell_type_monocyte, cell_type_nk]

adata = cellxgene_census.get_anndata(
    census=census,
    organism="homo_sapiens",
    obsm_layers=["scvi"],
    obs_value_filter=f"dataset_id in {datasets} and disease == '{disease}' and cell_type in {cell_types} and donor_id == '{donor_id}'",
)

In [11]:
# adata.obs["donor_id"].value_counts()

In [12]:
# scanpy.pp.neighbors(adata, use_rep="scvi")
# scanpy.tl.umap(adata)
# scanpy.pl.umap(adata, color=["cell_type"])

In [9]:
# Setup

# Assuming sequenced to 50%, dataset specific number
adata.var.index = adata.var["feature_name"].values
adata.obs["q"] = 0.15


# Classical monocyte encoded as 1
adata.obs["treatment"] = (adata.obs["cell_type"] == cell_types[0]).astype(int)

# Setup memento
memento.setup_memento(adata, q_column="q", trim_percent=0.1)  # trim_percent tunes cell size calculation
memento.create_groups(adata, label_columns=["treatment"])
memento.compute_1d_moments(adata, min_perc_group=0.9)
group_metadata = memento.get_groups(adata)

treatment_df = group_metadata[["treatment"]]

memento.ht_1d_moments(
    adata,
    # covariate=covariate_df,
    treatment=treatment_df,
    num_boot=5000,
    verbose=1,
    num_cpus=12,
    resample_rep=False,
    approx=False,
)

result = memento.get_1d_ht_result(adata)

  df[col] = pd.to_numeric(df[col], errors='ignore')
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    2.9s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    4.8s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    8.0s
[Parallel(n_jobs=12)]: Done 1108 tasks      | elapsed:   18.2s
[Parallel(n_jobs=12)]: Done 2008 tasks      | elapsed:   31.4s
[Parallel(n_jobs=12)]: Done 3108 tasks      | elapsed:   46.8s
[Parallel(n_jobs=12)]: Done 4300 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done 5224 out of 5247 | elapsed:  1.3min remaining:    0.4s
[Parallel(n_jobs=12)]: Done 5247 out of 5247 | elapsed:  1.3min finished


In [14]:
# result

#### 2-4 known comparisons, simple batch-effects with multi-datasets but no multi-assay.

**a) Single-dataset, two donors, controlling for donors**

- Collection: A Web Portal and Workbench for Biological Dissection of Single Cell COVID-19 Host Responses
- Dataset: Individual Single-Cell RNA-seq PBMC Data from Arunachalam et al.
   - Assay: 10X
- Comparison:
   - classical monocytes vs T-cells in two donors, including donors as covariates.


In [15]:
datasets = [dataset_aruna]
donor_id = ["cov17", "cov18"]
disease = "normal"
cell_types = [cell_type_monocyte, cell_type_nk]

adata = cellxgene_census.get_anndata(
    census=census,
    organism="homo_sapiens",
    obsm_layers=["scvi"],
    obs_value_filter=f"dataset_id in {datasets} and disease == '{disease}' and cell_type in {cell_types} and donor_id in {donor_id}",
)

In [16]:
# adata.obs["donor_id"].value_counts()

In [17]:
# scanpy.pp.neighbors(adata, use_rep="scvi")
# scanpy.tl.umap(adata)
# scanpy.pl.umap(adata, color=["cell_type"])
# scanpy.pl.umap(adata, color=["donor_id"])

In [18]:
# Setup

# Assuming sequenced to 50%, dataset specific number
adata.var.index = adata.var["feature_name"].values
adata.obs["q"] = 0.15


# Classical monocyte encoded as 1
adata.obs["treatment"] = (adata.obs["cell_type"] == cell_types[0]).astype(int)

# Setup memento
memento.setup_memento(adata, q_column="q", trim_percent=0.1)  # trim_percent tunes cell size calculation
memento.create_groups(adata, label_columns=["treatment", "donor_id"])
memento.compute_1d_moments(adata, min_perc_group=0.9)
group_metadata = memento.get_groups(adata)

treatment_df = group_metadata[["treatment"]]
covariate_df = pd.get_dummies(group_metadata[["donor_id"]], drop_first=True).astype(float)
covariate_df -= covariate_df.mean()  # covariates for Lin's estimator

# Include interactions
covariate_df["interaction"] = treatment_df.iloc[:, 0] * covariate_df.iloc[:, 0]

memento.ht_1d_moments(
    adata,
    covariate=covariate_df,
    treatment=treatment_df,
    num_boot=5000,
    verbose=1,
    num_cpus=12,
    resample_rep=False,
    approx=False,
)

result = memento.get_1d_ht_result(adata)

  df[col] = pd.to_numeric(df[col], errors='ignore')
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  29 tasks      | elapsed:    0.7s
[Parallel(n_jobs=12)]: Done 328 tasks      | elapsed:    5.4s
[Parallel(n_jobs=12)]: Done 828 tasks      | elapsed:   14.1s
[Parallel(n_jobs=12)]: Done 1528 tasks      | elapsed:   26.8s
[Parallel(n_jobs=12)]: Done 2428 tasks      | elapsed:   42.1s
[Parallel(n_jobs=12)]: Done 3384 tasks      | elapsed:  1.0min
[Parallel(n_jobs=12)]: Done 4034 tasks      | elapsed:  1.2min
[Parallel(n_jobs=12)]: Done 4784 tasks      | elapsed:  1.4min
[Parallel(n_jobs=12)]: Done 4834 out of 4857 | elapsed:  1.4min remaining:    0.4s
[Parallel(n_jobs=12)]: Done 4857 out of 4857 | elapsed:  1.4min finished


In [19]:
result

gene,tx,de_coef,de_se,de_pval,dv_coef,dv_se,dv_pval
Loading... (need help?),,,,,,,


**b) Single-dataset, four donors, controlling for donors**

- Collection: A Web Portal and Workbench for Biological Dissection of Single Cell COVID-19 Host Responses
- Dataset: Individual Single-Cell RNA-seq PBMC Data from Arunachalam et al.
   - Assay: 10X
- Comparison:
   - classical monocytes vs T-cells in four donors, including donors as covariates.

In [20]:
datasets = [dataset_aruna]
donor_id = ["cov17", "cov18", "cov07", "cov08", "cov09"]
disease = "normal"
cell_types = [cell_type_monocyte, cell_type_nk]


adata = cellxgene_census.get_anndata(
    census=census,
    organism="homo_sapiens",
    obsm_layers=["scvi"],
    obs_value_filter=f"dataset_id in {datasets} and disease == '{disease}' and cell_type in {cell_types} and donor_id in {donor_id}",
)

In [21]:
# adata.obs["donor_id"].value_counts()

In [22]:
# scanpy.pp.neighbors(adata, use_rep="scvi")
# scanpy.tl.umap(adata)
# scanpy.pl.umap(adata, color=["cell_type"])
# scanpy.pl.umap(adata, color=["donor_id"])

In [23]:
# Setup

# Assuming sequenced to 50%, dataset specific number
adata.var.index = adata.var["feature_name"].values
adata.obs["q"] = 0.15


# Classical monocyte encoded as 1
adata.obs["treatment"] = (adata.obs["cell_type"] == cell_types[0]).astype(int)

# Setup memento
memento.setup_memento(adata, q_column="q", trim_percent=0.1)  # trim_percent tunes cell size calculation
memento.create_groups(adata, label_columns=["treatment", "donor_id"])
memento.compute_1d_moments(adata, min_perc_group=0.9)
group_metadata = memento.get_groups(adata)

treatment_df = group_metadata[["treatment"]]
covariate_df = pd.get_dummies(group_metadata[["donor_id"]], drop_first=True).astype(float)
covariate_df -= covariate_df.mean()  # covariates for Lin's estimator

# Include interactions
covariate_df["interaction"] = treatment_df.iloc[:, 0] * covariate_df.iloc[:, 0]

memento.ht_1d_moments(
    adata,
    covariate=covariate_df,
    treatment=treatment_df,
    num_boot=5000,
    verbose=1,
    num_cpus=12,
    resample_rep=False,
    approx=False,
)

result = memento.get_1d_ht_result(adata)

  df[col] = pd.to_numeric(df[col], errors='ignore')
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  28 tasks      | elapsed:    0.8s
[Parallel(n_jobs=12)]: Done 328 tasks      | elapsed:    7.2s
[Parallel(n_jobs=12)]: Done 828 tasks      | elapsed:   18.9s
[Parallel(n_jobs=12)]: Done 1528 tasks      | elapsed:   35.2s
[Parallel(n_jobs=12)]: Done 2428 tasks      | elapsed:   55.7s
[Parallel(n_jobs=12)]: Done 3204 tasks      | elapsed:  1.3min
[Parallel(n_jobs=12)]: Done 3854 tasks      | elapsed:  1.5min
[Parallel(n_jobs=12)]: Done 4297 out of 4297 | elapsed:  1.7min finished


In [24]:
# result

**c) Single-dataset COVID, two donors, controlling for donors**

- Collection: A Web Portal and Workbench for Biological Dissection of Single Cell COVID-19 Host Responses
- Dataset: Individual Single-Cell RNA-seq PBMC Data from Guo et al.
   - Assay: 10X
- Comparison:
   - classical monocytes vs T-cells in two donors, including donors as covariates.

In [25]:
datasets = [dataset_guo]
donor_id = ["P1", "P2"]
disease = "COVID-19"
cell_types = [cell_type_monocyte, cell_type_nk]

adata = cellxgene_census.get_anndata(
    census=census,
    organism="homo_sapiens",
    obsm_layers=["scvi"],
    obs_value_filter=f"dataset_id in {datasets} and disease == '{disease}' and cell_type in {cell_types} and donor_id in {donor_id}",
)

In [26]:
# adata.obs["donor_id"].value_counts()

In [27]:
# scanpy.pp.neighbors(adata, use_rep="scvi")
# scanpy.tl.umap(adata)
# scanpy.pl.umap(adata, color=["cell_type"])
# scanpy.pl.umap(adata, color=["donor_id"])

In [28]:
# Setup

# Assuming sequenced to 50%, dataset specific number
adata.var.index = adata.var["feature_name"].values
adata.obs["q"] = 0.15


# Classical monocyte encoded as 1
adata.obs["treatment"] = (adata.obs["cell_type"] == cell_types[0]).astype(int)

# Setup memento
memento.setup_memento(adata, q_column="q", trim_percent=0.1)  # trim_percent tunes cell size calculation
memento.create_groups(adata, label_columns=["treatment", "donor_id"])
memento.compute_1d_moments(adata, min_perc_group=0.9)
group_metadata = memento.get_groups(adata)

treatment_df = group_metadata[["treatment"]]
covariate_df = pd.get_dummies(group_metadata[["donor_id"]], drop_first=True).astype(float)
covariate_df -= covariate_df.mean()  # covariates for Lin's estimator

# Include interactions
covariate_df["interaction"] = treatment_df.iloc[:, 0] * covariate_df.iloc[:, 0]

memento.ht_1d_moments(
    adata,
    covariate=covariate_df,
    treatment=treatment_df,
    num_boot=5000,
    verbose=1,
    num_cpus=12,
    resample_rep=False,
    approx=False,
)

result = memento.get_1d_ht_result(adata)

  df[col] = pd.to_numeric(df[col], errors='ignore')
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  29 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 328 tasks      | elapsed:    4.2s
[Parallel(n_jobs=12)]: Done 828 tasks      | elapsed:   10.3s
[Parallel(n_jobs=12)]: Done 1528 tasks      | elapsed:   19.8s
[Parallel(n_jobs=12)]: Done 2428 tasks      | elapsed:   32.1s
[Parallel(n_jobs=12)]: Done 2887 out of 2910 | elapsed:   38.3s remaining:    0.3s
[Parallel(n_jobs=12)]: Done 2910 out of 2910 | elapsed:   38.5s finished


In [29]:
# result