In [1]:
import numpy as np
import scanpy as sc
import anndata as ad
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib import cm
import re
# Figure resolution: modest DPI for inline display, high DPI for saved files.
plt.rcParams.update({
    'figure.dpi': 100,
    'savefig.dpi': 300,
})

# Import Data

In [5]:
# Load the single-cell AnnData and expose the log1p-normalized layer as X,
# so everything downstream that reads adata.X works on log1p-normalized values.
input_h5ad = '/data/rudensky/EYW/SIG07/scanpy_outs/SIG07_doublets_CR_RNA_zscore.h5ad'
adata = ad.read_h5ad(input_h5ad)
log1p_layer = adata.layers['log1p_norm']
adata.X = log1p_layer.copy()  # copy so X and the stored layer do not share memory

In [6]:
# Collapse the two linker labels ("linker-1", "linker-2") into a single
# "linker" label so they are treated as one category downstream.
ligand_col = 'ligand_call_oBC_CR'
ligand_calls = adata.obs[ligand_col]
adata.obs[ligand_col] = ligand_calls.str.replace(r'linker-(1|2)', 'linker', regex=True)
print(adata.obs[ligand_col].unique())

['IL4_linker' 'IL4_IL12' 'IL12_IFNA' 'IL4_IFNA' 'IL12_linker' 'IL12_IL6'
 'IL2_TNF' 'IL21_linker' 'IFNA_linker' 'IL2_linker' 'IL2_IL6' 'IL4_IL6'
 'TNF_linker' 'IL2_IL12' 'IL2_IL4' 'IFNA_TNF' 'IL6_linker' 'IL4_IL21'
 'IL4_IL27' 'linker_linker' 'IL27_linker' 'IL27_TNF' 'IFNA_IL27'
 'IL6_IL21' 'IL2_IL27' 'IL4_TNF' 'IL6_TNF' 'IL2_IL21' 'IL6_IFNA'
 'IL6_IL27' 'IL2_IFNA' 'IL21_TNF' 'IFNA_IL21' 'IL21_IL27' 'IL12_TNF'
 'IL12_IL27' 'IL12_IL21']


In [7]:
# Number of cells per ligand-pair call (value_counts sorts by descending count);
# shows how many cells back each pseudobulk profile.
adata.obs['ligand_call_oBC_CR'].value_counts()

ligand_call_oBC_CR
TNF_linker       6643
IL4_linker       6527
IL2_linker       5895
IL27_linker      5523
IL6_linker       4026
IFNA_linker      3986
IL21_linker      3345
linker_linker    3111
IL4_TNF          2795
IL2_TNF          2769
IL27_TNF         2709
IL2_IL4          2619
IL12_linker      2617
IL4_IL27         2157
IL2_IL27         2149
IFNA_TNF         2064
IL6_TNF          1950
IL2_IL6          1900
IL2_IFNA         1834
IL4_IFNA         1827
IL4_IL6          1815
IFNA_IL27        1747
IL6_IL27         1711
IL21_TNF         1481
IL2_IL21         1443
IL12_IL27        1424
IL4_IL21         1413
IL12_TNF         1359
IFNA_IL21        1286
IL4_IL12         1269
IL6_IFNA         1175
IL12_IFNA        1101
IL21_IL27        1075
IL6_IL21          947
IL2_IL12          844
IL12_IL21         818
IL12_IL6          572
Name: count, dtype: int64

# Pseudobulk by mean
Here, I compute pseudobulk profiles by simply averaging the log1p-normalized expression values across all cells within each group.

In [8]:
def mean_aggr_adata(adata, aggregate_columns=['ligand_call_oBC_CR']):
    """
    Aggregates an AnnData object by calculating mean expressions for groups defined by `aggregate_columns`.

    The mean is taken over whatever is currently stored in `adata.X`, one row per
    observed combination of the grouping columns.

    Parameters:
    -----------
    adata : AnnData
        Input AnnData object. `adata.X` may be a dense ndarray or any matrix
        exposing `.toarray()`; it is fully densified in memory.
    aggregate_columns : list or str
        Column name(s) in `adata.obs` used to group and aggregate the data.
        A single column name may be passed as a plain string.

    Returns:
    --------
    AnnData
        A new AnnData object with aggregated (mean) expressions. `obs` holds one
        row per group with the grouping columns as its only columns (string index
        '0', '1', ...); `var` is a copy of `adata.var`.

    Raises:
    -------
    ValueError
        If `aggregate_columns` is empty.
    KeyError
        If any requested column is missing from `adata.obs`.

    Notes:
    ------
    Cells with a missing value in any grouping column are left out of every
    group (pandas `groupby` drops NaN keys by default).
    """
    # Accept a bare column name as well as a list/tuple. Building a fresh list
    # also guarantees the mutable default argument is never modified.
    if isinstance(aggregate_columns, str):
        group_cols = [aggregate_columns]
    else:
        group_cols = list(aggregate_columns)

    if not group_cols:
        raise ValueError("aggregate_columns must name at least one column of adata.obs")

    missing = [col for col in group_cols if col not in adata.obs.columns]
    if missing:
        raise KeyError(f"aggregate_columns not found in adata.obs: {missing}")

    # Densify the expression matrix (cells x genes) so pandas can average it.
    dense_X = adata.X if isinstance(adata.X, np.ndarray) else adata.X.toarray()
    expr = pd.DataFrame(dense_X, columns=adata.var_names)

    # Group by the obs Series directly instead of inserting them as columns of
    # the expression frame: a grouping column that shares its name with a gene
    # would otherwise overwrite that gene's expression values.
    # Both sides use a positional RangeIndex, so rows line up one-to-one even
    # if obs names are not unique.
    group_keys = [adata.obs[col].reset_index(drop=True) for col in group_cols]

    # observed=True: only emit category combinations that actually occur.
    mean_aggregated = expr.groupby(group_keys, observed=True).mean()

    # Turn the group identifiers (Index / MultiIndex) back into obs columns.
    new_obs = mean_aggregated.index.to_frame(index=False)
    # AnnData requires string obs names; convert explicitly rather than relying
    # on its implicit (warning-emitting) conversion of an integer index.
    new_obs.index = new_obs.index.astype(str)

    aggregated_adata = ad.AnnData(
        X=mean_aggregated.values,
        obs=new_obs,
        var=adata.var.copy()  # Retain the original gene information
    )

    return aggregated_adata

In [10]:
# Pseudobulk: one mean-expression profile per observed
# (ligand_call_oBC_CR, group_call_CR) combination.
adataPB = mean_aggr_adata(adata, aggregate_columns=['ligand_call_oBC_CR','group_call_CR'])



In [12]:
# Display the pseudobulk object's summary (dimensions, obs and var columns).
adataPB

AnnData object with n_obs × n_vars = 95 × 10440
    obs: 'ligand_call_oBC_CR', 'group_call_CR'
    var: 'gene_ids', 'feature_types', 'genome', 'pattern', 'read', 'sequence', 'target_gene_id', 'target_gene_name', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mean', 'std', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'

In [13]:
# Save the mean-aggregated pseudobulk AnnData to disk for downstream notebooks.
pseudobulk_h5ad = "/data/rudensky/EYW/SIG07/scanpy_outs/SIG07_doublets_CR_RNA_log1p_mean_pseudobulk.h5ad"
adataPB.write(filename=pseudobulk_h5ad)