<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

In [0]:
#| echo: false
#| output: asis
show_doc(create_pseudobulk_samples)

---

[source](https://github.com/cobioda/allos/blob/main/allos/metacells.py#L12){target="_blank" style="float:right; font-size:smaller"}

### create_pseudobulk_samples

>      create_pseudobulk_samples (adata:anndata._core.anndata.AnnData,
>                                 donor_column:str, cell_type_column:str,
>                                 metadata_columns:list=None)

*Create pseudobulk samples from single-cell data.

Parameters:
adata (AnnData): Annotated data matrix.
donor_column (str): Column name in `adata.obs` for donor information.
cell_type_column (str): Column name in `adata.obs` for cell type information.
metadata_columns (list, optional): List of additional metadata columns to include.

Returns:
AnnData: Combined pseudobulk samples as an AnnData object.*

In [None]:
#export
import scanpy as sc
import numpy as np
import scipy.sparse as sp


def create_metacells(adata: sc.AnnData, knn_indices: np.ndarray, n_iter: int = 50, k: int = 30) -> np.ndarray:
    """
    Build a smoothed ("metacell") expression matrix by bagging over k-NN lists.

    Every cell gets one output row. That row is obtained by drawing `k`
    entries (with replacement) from the cell's neighbor list, averaging the
    expression rows of the drawn cells, repeating this `n_iter` times, and
    finally averaging the `n_iter` per-draw means.

    Parameters
    ----------
    adata : sc.AnnData
        Object whose `.X` holds the expression matrix (dense or scipy sparse).
        A sparse matrix is converted to a dense array before sampling.
    knn_indices : np.ndarray
        Row `i` lists the row indices of `adata.X` that cell `i` may sample
        from (for example, neighbor indices of shape [n_cells, n_neighbors]).
    n_iter : int, optional
        Number of bootstrap draws per cell (default is 50).
    k : int, optional
        Number of neighbor indices drawn in each bootstrap draw (default is 30).

    Returns
    -------
    np.ndarray
        Array of shape (n_cells, n_genes) with one averaged profile per cell.

    Notes
    -----
    Sampling goes through `np.random.choice`, i.e. NumPy's global random
    state; call `np.random.seed` beforehand for reproducible output.
    """
    expression = adata.X
    if sp.issparse(expression):
        # Row-wise fancy indexing below is done on a dense array.
        expression = expression.toarray()
    n_cells, n_genes = expression.shape

    def bagged_profile(neighbors):
        # One mean profile per bootstrap draw, then the mean across draws.
        draw_means = [
            np.mean(expression[np.random.choice(neighbors, size=k, replace=True)], axis=0)
            for _ in range(n_iter)
        ]
        return np.mean(draw_means, axis=0)

    smoothed = np.zeros((n_cells, n_genes))
    for cell in range(n_cells):
        smoothed[cell] = bagged_profile(knn_indices[cell])
    return smoothed


def _knn_indices_from_neighbor_graph(adata):
    """
    Recover per-cell neighbor index lists from the neighbor graph stored on `adata`.

    If `adata.uns['neighbors']` carries an explicit 'indices' entry it is
    returned unchanged. Otherwise the sparse distance graph is used: it is
    looked up via `adata.uns['neighbors']['distances_key']` in `adata.obsp`
    (the layout documented for `sc.pp.neighbors`), falling back to
    `adata.uns['neighbors']['distances']`.

    Each returned row starts with the cell's own index, followed by its stored
    neighbors ordered from nearest to farthest. A 2-D array is returned when
    every cell has the same number of neighbors, otherwise a list of 1-D arrays.
    """
    neighbors_info = adata.uns['neighbors']
    if 'indices' in neighbors_info:
        return neighbors_info['indices']

    if 'distances_key' in neighbors_info:
        distances = adata.obsp[neighbors_info['distances_key']]
    else:
        distances = neighbors_info['distances']
    distances = sp.csr_matrix(distances)

    rows = []
    for cell in range(distances.shape[0]):
        start, stop = distances.indptr[cell], distances.indptr[cell + 1]
        order = np.argsort(distances.data[start:stop], kind='stable')
        neighbors = distances.indices[start:stop][order]
        # Put the cell itself first and avoid listing it twice if it was stored.
        rows.append(np.concatenate(([cell], neighbors[neighbors != cell])))

    if len({row.size for row in rows}) == 1:
        return np.vstack(rows)
    return rows


def run_metacell_pipeline(adata: sc.AnnData,
                          annotation: str = 'your_annotation',
                          pre_neighbors: int = 30,
                          pre_n_pcs: int = 50,
                          post_neighbors: int = 15,
                          post_n_pcs: int = 20,
                          n_iter: int = 50,
                          k: int = 30) -> sc.AnnData:
    """
    Run the complete metacell pipeline on a single-cell AnnData object.

    The pipeline performs the following steps:
      1. Preprocessing:
         - Heuristically checks whether log1p has been applied (maximum
           expression value <= 12). If not, a warning is issued and
           sc.pp.log1p is applied; beforehand, if the median per-cell total is
           not within 20% of 1e4, a warning is issued and
           sc.pp.normalize_total (target_sum=1e4) is applied as well.
         - Selects the top 2000 highly variable genes (flavor='seurat') and
           runs PCA on them.
      2. Computes the initial k-NN graph.
      3. Creates a metacell expression matrix via k-NN bagging
         (see `create_metacells`).
      4. Constructs a new AnnData object from the metacell matrix.

    Note that the normalization, log1p and highly-variable-gene steps are
    applied to the passed-in `adata` in place; PCA and the neighbor graph are
    computed on a gene-subsetted copy.

    Parameters
    ----------
    adata : sc.AnnData
        Annotated single-cell data. Modified in place by the preprocessing
        steps described above.
    annotation : str, optional
        Currently unused by this function; kept for interface compatibility
        (default is 'your_annotation').
    pre_neighbors : int, optional
        Number of neighbors for the initial k-NN graph (default is 30).
    pre_n_pcs : int, optional
        Number of principal components for the initial PCA (default is 50).
    post_neighbors : int, optional
        Currently unused by this function; kept for interface compatibility
        (default is 15).
    post_n_pcs : int, optional
        Currently unused by this function; kept for interface compatibility
        (default is 20).
    n_iter : int, optional
        Number of bootstrap iterations for metacell creation (default is 50).
    k : int, optional
        Number of neighbors sampled per iteration (default is 30).

    Returns
    -------
    sc.AnnData
        AnnData object whose `.X` is the metacell expression matrix over the
        selected highly variable genes, with `obs` and `var` copied from the
        gene-subsetted data.
    """
    import warnings

    # Step 1: Preprocessing - Check if normalization and log1p transformation have been applied.
    matrix = adata.X
    if sp.issparse(matrix):
        # Only stored entries can exceed the threshold, so the matrix does not
        # need to be densified for this check.
        stored = matrix.tocsr().data
        max_value = np.nanmax(stored) if stored.size else 0.0
    else:
        max_value = np.nanmax(matrix)

    # Heuristic: if the maximum value in the expression matrix is > 12, assume the data
    # has not yet been log-transformed.
    if max_value > 12:
        cell_sums = np.asarray(matrix.sum(axis=1)).ravel()
        # Check if normalization (target sum ~1e4) has been applied
        if not np.isclose(np.median(cell_sums), 1e4, rtol=0.2):
            warnings.warn("Normalization has not been applied. Automatically applying sc.pp.normalize_total with target_sum=1e4.")
            sc.pp.normalize_total(adata, target_sum=1e4)
        warnings.warn("Log1p transformation has not been applied. Automatically applying sc.pp.log1p to the data.")
        sc.pp.log1p(adata)

    # Select highly variable genes and run PCA.
    sc.pp.highly_variable_genes(adata, flavor='seurat', n_top_genes=2000)
    # Take a real copy: PCA and the neighbor graph are written into this
    # object, which should not be a view of the caller's data.
    adata = adata[:, adata.var.highly_variable].copy()
    sc.pp.pca(adata, n_comps=pre_n_pcs)

    # Step 2: Compute initial k-NN graph.
    sc.pp.neighbors(adata, n_neighbors=pre_neighbors, n_pcs=pre_n_pcs)
    # The neighbor lists are rebuilt from the stored distance graph rather
    # than read from a fixed 'indices' key in adata.uns['neighbors'].
    knn_indices = _knn_indices_from_neighbor_graph(adata)

    # Step 3: Create metacells using k-NN bagging.
    metacell_matrix = create_metacells(adata, knn_indices, n_iter=n_iter, k=k)

    # Step 4: Construct a new AnnData object for metacells.
    metacell_adata = sc.AnnData(X=metacell_matrix, obs=adata.obs.copy(), var=adata.var.copy())

    return metacell_adata