In [56]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from scviz import utils
from scviz import setup

# A = [[7,2,3],[4,np.nan,6],[10,5,np.nan],[np.nan,np.nan,2]]
# print(A)
# print("\n\n")
# column_trans = ColumnTransformer(
# [('imp_col1', SimpleImputer(strategy='mean'), [1]),
#  ('imp_col2', SimpleImputer(strategy='constant', fill_value=29), [2,3])],
# remainder='passthrough')

# print(column_trans.fit_transform(A)[:, [2,0,1]])

In [1]:
def impute(self, classes=None, layer="X", method='mean', on='protein', set_X=True, **kwargs):
    """
    Impute missing values across samples (globally or within classes) using SimpleImputer.

    Parameters:
        classes (str or list): Class columns in .obs to group by.
        layer (str): Data layer to impute from.
        method (str): 'mean', 'median', or 'min'.
        on (str): 'protein' or 'peptide'.
        set_X (bool): Whether to set .X to the imputed result.
    """
    from sklearn.impute import SimpleImputer, KNNImputer
    from scipy import sparse
    from scviz import utils


    if not self._check_data(on):
        return

    adata = self.prot if on == 'protein' else self.pep
    if layer != "X" and layer not in adata.layers:
        raise ValueError(f"Layer '{layer}' not found in .{on}.")

    impute_data = adata.layers[layer] if layer != "X" else adata.X
    was_sparse = sparse.issparse(impute_data)
    impute_data = impute_data.toarray() if was_sparse else impute_data.copy()
    original_data = impute_data.copy()

    layer_name = f"X_impute_{method}"

    if method not in {"mean", "median", "min","knn"}:
        raise ValueError(f"Unsupported method: {method}")

    if classes is None:
        # Global imputation
        if method == 'min':
            min_vals = np.nanmin(impute_data, axis=0)
            min_vals = np.where(np.isnan(min_vals), 0, min_vals)
            mask = np.isnan(impute_data)
            impute_data[mask] = np.take(min_vals, np.where(mask)[1])
        elif method == 'knn':
            n_neighbors = kwargs.get('n_neighbors', 3)
            imputer = KNNImputer(n_neighbors=n_neighbors)
            impute_data = imputer.fit_transform(impute_data)
        else:
            imputer = SimpleImputer(strategy=method)
            impute_data = imputer.fit_transform(impute_data)

        print(f"ℹ️ Global imputation using '{method}'. Layer saved as '{layer_name}'.")

    else:
        # Group-wise imputation
        if method == 'knn':
            raise ValueError("KNN imputation is not supported for group-wise imputation.")

        sample_names = utils.get_samplenames(adata, classes)
        sample_names = np.array(sample_names)
        unique_groups = np.unique(sample_names)

        for group in unique_groups:
            idx = np.where(sample_names == group)[0]
            group_data = impute_data[idx, :]

            if method == 'min':
                min_vals = np.nanmin(group_data, axis=0)
                min_vals = np.where(np.isnan(min_vals), 0, min_vals)
                mask = np.isnan(group_data)
                group_data[mask] = np.take(min_vals, np.where(mask)[1])
                imputed_group = group_data
            else:
                imputer = SimpleImputer(strategy=method)
                imputed_group = imputer.fit_transform(group_data)

            impute_data[idx, :] = imputed_group

        print(f"ℹ️ Group-wise imputation using '{method}' on class(es): {classes}. Layer saved as '{layer_name}'.")

    summary_lines = []
    if classes is None:
        num_imputed = np.sum(np.isnan(original_data) & ~np.isnan(impute_data))
        summary_lines.append(f"✅ {num_imputed} values imputed.")
    else:
        sample_names = utils.get_samplenames(adata, classes)
        sample_names = np.array(sample_names)
        unique_groups = np.unique(sample_names)

        counts_by_group = {}
        for group in unique_groups:
            idx = np.where(sample_names == group)[0]
            before = original_data[idx, :]
            after = impute_data[idx, :]
            mask = np.isnan(before) & ~np.isnan(after)
            counts_by_group[group] = np.sum(mask)

        total = sum(counts_by_group.values())
        summary_lines.append(f"✅ {total} values imputed total.")
        for group, count in counts_by_group.items():
            summary_lines.append(f"   - {group}: {count} values")

    print("\n".join(summary_lines))

    adata.layers[layer_name] = sparse.csr_matrix(impute_data) if was_sparse else impute_data

    if set_X:
        self.set_X(layer=layer_name, on=on)

    self._history.append(
        f"{on}: Imputed layer '{layer}' using '{method}' (grouped by {classes if classes else 'ALL'}). Stored in '{layer_name}'."
    )



In [93]:
from matplotlib.pylab import f


def normalize(self, classes = None, layer = "X", method = 'sum', on = 'protein', set_X = True, force = False, use_nonmissing = False, **kwargs):  
    """ 
    Normalize the data across samples (globally or within groups).

    Parameters:
    - classes (str or list): Sample-level class/grouping column(s) in .obs.
    - layer (str): Data layer to normalize from (default='X').
    - method (str): Normalization method. Options: 'sum', 'median', 'mean', 'max', 'reference_feature', 'robust_scale', 'quantile_transform'.
    - on (str): 'protein' or 'peptide'.
    - set_X (bool): Whether to set .X to the normalized result.
    - force (bool): Whether to force normalization even with bad rows.
    - use_nonmissing (bool): Whether to use only fully observed columns for normalization.
    - **kwargs: Additional arguments for normalization methods.
        (e.g., reference_columns for 'reference_feature', n_neighbors for 'knn').
        max_missing_fraction: Maximum fraction of missing values allowed in a row. Default is 0.5.

    """
    
    if not self._check_data(on):
        return

    adata = self.prot if on == 'protein' else self.pep
    if layer != "X" and layer not in adata.layers:
        raise ValueError(f"Layer {layer} not found in .{on}.")
    
    normalize_data = adata.layers[layer] if layer != "X" else adata.X
    was_sparse = sparse.issparse(normalize_data)
    normalize_data = normalize_data.toarray() if was_sparse else normalize_data.copy()
    original_data = normalize_data.copy()

    # Check for bad rows (too many missing values)
    missing_fraction = np.isnan(normalize_data).sum(axis=1) / normalize_data.shape[1]
    max_missing_fraction = kwargs.pop("max_missing_fraction", 0.5)
    bad_rows_mask = missing_fraction > max_missing_fraction

    if np.any(bad_rows_mask):
        n_bad = np.sum(bad_rows_mask)
        print(f"⚠️ {n_bad} sample(s) have >{int(max_missing_fraction*100)}% missing values.")
        print("   Suggest running `.impute()` before normalization for more stable results.")
        print("   Alternatively, try `use_nonmissing=True` to normalize using only consistently observed proteins.")
        if not force:
            print("   ➡️ Use `force=True` to proceed anyway.")
            return

    layer_name = 'X_norm_' + method
    normalize_funcs = ['sum', 'median', 'mean', 'max', 'reference_feature', 'robust_scale', 'quantile_transform']

    if method not in normalize_funcs:
        raise ValueError(f"Unsupported normalization method: {method}")

    if classes is None:
        normalize_data = self._normalize_helper(normalize_data, method, use_nonmissing=use_nonmissing, **kwargs)
        msg=f"ℹ️ Global normalization using '{method}'"
    else:
        # Group-wise normalization
        sample_names = utils.get_samplenames(adata, classes)
        sample_names = np.array(sample_names)
        unique_groups = np.unique(sample_names)

        for group in unique_groups:
            idx = np.where(sample_names == group)[0]
            group_data = normalize_data[idx, :]

            normalized_group = self._normalize_helper(group_data, method=method, use_nonmissing=use_nonmissing, **kwargs)
            normalize_data[idx, :] = normalized_group

        msg=f"ℹ️ Group-wise normalization using '{method}' on class(es): {classes}"

    if use_nonmissing and method in {'sum', 'mean', 'median', 'max'}:
        msg += f" (using only fully observed columns)"

    msg += f". Layer saved as '{layer_name}'."
    print(msg)

    # summary printout
    summary_lines = []
    if classes is None:
        summary_lines.append(f"✅ Normalized all {normalize_data.shape[0]} samples.")
    else:
        for group in unique_groups:
            count = np.sum(sample_names == group)
            summary_lines.append(f"   - {group}: {count} samples normalized")
        summary_lines.insert(0, f"✅ Normalized {normalize_data.shape[0]} samples total.")
    print("\n".join(summary_lines))            

    adata.layers[layer_name] = sparse.csr_matrix(normalize_data) if was_sparse else normalize_data

    if set_X:
        self.set_X(layer = layer_name, on = on)

    # Determine if use_nonmissing note should be added
    note = ""
    if kwargs.get("use_nonmissing", False) and method in {'sum', 'mean', 'median', 'max'}:
        note = " (using only fully observed columns)"

    self._history.append(
        f"{on}: Normalized layer {layer} using {method}{note} (grouped by {classes}). Stored in `{layer_name}`."
    )

def _normalize_helper(self, data, method, use_nonmissing, **kwargs):
    """
    Helper function for normalization methods.

    Parameters:
    - data (np.ndarray): Data to normalize.
    - method (str): Normalization method.

    Returns:
    - np.ndarray: Normalized data.
    """

    if method in {'sum', 'mean', 'median', 'max'}:
        reducer = {
                'sum': np.nansum,
                'mean': np.nanmean,
                'median': np.nanmedian,
                'max': np.nanmax
            }[method]

        if use_nonmissing:
            fully_observed_cols = ~np.isnan(data).any(axis=0)
            if not np.any(fully_observed_cols):
                raise ValueError("No fully observed columns available for normalization with `use_nonmissing=True`.")
            used_cols = np.where(fully_observed_cols)[0]
            print(f"ℹ️ Normalizing using only fully observed columns: {used_cols}")
            row_vals = reducer(data[:, fully_observed_cols], axis=1)
        else:
            row_vals = reducer(data, axis=1)

        scale = np.nanmax(row_vals) / row_vals
        scale = np.where(np.isnan(scale), 1.0, scale)
        data_norm = data * scale[:, None]

    elif method == 'reference_feature':
        # norm by reference feature: scale each row s.t. the reference column is the same across all rows (scale to max value of reference column)
        reference_columns = kwargs.get('reference_columns', [2])
        reference_method = kwargs.get('reference_method', 'median')  # default to median

        reducer_map = {
            'mean': np.nanmean,
            'median': np.nanmedian,
            'sum': np.nansum
        }

        if reference_method not in reducer_map:
            raise ValueError(f"Unsupported reference method: {reference_method}. Supported methods are: {list(reducer_map.keys())}")
        reducer = reducer_map[reference_method]

        # resolve reference column names if needed
        if isinstance(reference_columns[0], str):
            gene_to_acc, _ = self.get_gene_maps(on='protein')
            resolved = utils.resolve_accessions(self.prot, reference_columns, gene_map=gene_to_acc)
            reference_acc = [ref for ref in resolved if ref in self.prot.var.index]
            reference_columns = [self.prot.var.index.get_loc(ref) for ref in reference_acc]
            print(f"ℹ️ Normalizing using found reference columns: {reference_acc}")
            self._history.append(f"Used reference_feature normalization with resolved accessions: {resolved}")
        else:
            reference_columns = [int(ref) for ref in reference_columns]
            reference_acc = [self.prot.var.index[ref] for ref in reference_columns if ref < self.prot.shape[1]]
            print(f"ℹ️ Normalizing using reference columns: {reference_acc}")
            self._history.append(f"Used reference_feature normalization with resolved accessions: {reference_acc}")

        scaling_factors = np.nanmean(np.nanmax(data[:, reference_columns], axis=0) / (data[:, reference_columns]), axis=1)

        nan_rows = np.where(np.isnan(scaling_factors))[0]
        if nan_rows.size > 0:
            print(f"⚠️ Rows {list(nan_rows)} have all missing reference values.")
            print("   ➡️ Falling back to row median normalization for these rows.")
            
            fallback = np.nanmedian(data[nan_rows, :], axis=1)
            fallback[fallback == 0] = np.nan  # avoid division by 0
            fallback_scale = np.nanmax(fallback) / fallback
            fallback_scale = np.where(np.isnan(fallback_scale), 1.0, fallback_scale)  # default to 1.0 if all else fails

            scaling_factors[nan_rows] = fallback_scale

        scaling_factors = np.where(np.isnan(scaling_factors), np.nanmean(scaling_factors), scaling_factors)
        data_norm = data * scaling_factors[:, None]

    elif method == 'robust_scale':
        # norm by robust_scale: Center to the median and component wise scale according to the interquartile range. See sklearn.preprocessing.robust_scale for more information.
        from sklearn.preprocessing import robust_scale
        data_norm = robust_scale(data, axis=1)

    elif method == 'quantile_transform':
        # norm by quantile_transform: Transform features using quantiles information. See sklearn.preprocessing.quantile_transform for more information.
        from sklearn.preprocessing import quantile_transform
        data_norm = quantile_transform(data, axis=1)

    else:
        raise ValueError(f"Unknown method: {method}")

    return data_norm

In [94]:
import numpy as np
import pandas as pd
from anndata import AnnData
from scipy import sparse

# ---- STEP 1: Create toy data ----
# 6 samples (obs), 4 proteins (var)
X = np.array([
    [1,    np.nan, 10,   100,  500,  2.0],     # BE_kd1
    [2,    20,     np.nan, 200,  500,  2.5],   # BE_kd2
    [np.nan, 30,   30,   np.nan, 500,  3.0],   # BE_kd3
    [100,  np.nan, 1000, 500,  500,  2.8],     # AS_sc1
    [200, 400,     np.nan, np.nan, 500,  2.2], # AS_sc2
    [np.nan, 600,  3000, 1500, 500,  2.1],     # AS_sc3
])


obs = pd.DataFrame({
    "cellline": ["BE", "BE", "BE", "AS", "AS", "AS"],
    "treatment": ["kd", "kd", "kd", "sc", "sc", "sc"]
}, index=[f"sample{i+1}" for i in range(6)])

var = pd.DataFrame({"Genes": ["GAPDH", "ACTB", "TUBB", "MYH9", "HSP90", "RPLP0"]}, index=[f"P{i+1}" for i in range(6)])

adata = AnnData(X=X, obs=obs, var=var)

# ---- STEP 2: Wrap in dummy pAnnData ----
class DummyPAnnData:
    def __init__(self, adata):
        self.prot = adata
        self.pep = None
        self._history = []
        
    def _check_data(self, on):
        return on == 'protein' and self.prot is not None

    @property
    def _cached_identifier_maps_protein(self):
        if not hasattr(self, "_gene_maps_protein"):
            self._gene_maps_protein = self._build_identifier_maps(self.prot)
        return self._gene_maps_protein

    def set_X(self, layer="X", on="protein"):
        adata = self.prot if on == "protein" else self.pep
        adata.X = adata.layers[layer].copy()
        print(f"ℹ️ Set {on} data to layer {layer}.")

    def get_gene_maps(self, on='protein'):
        """
        Returns identifier mapping dictionaries:
        - on='protein': (gene → accession, accession → gene)
        - on='peptide': (protein accession → peptide, peptide → protein accession)

        Alias: get_gene_maps() for compatibility.
        """
        if on == 'protein':
            return self._cached_identifier_maps_protein
        else:
            raise ValueError(f"Invalid value for 'on': {on}. Must be 'protein' or 'peptide'.")

    def _build_identifier_maps(self, adata, gene_col="Genes"):
        """
        Builds bidirectional mapping for:
        - protein: gene ↔ accession
        - peptide: peptide ↔ protein accession

        Returns: (forward, reverse)
        """
        from pandas import notna

        forward = {}
        reverse = {}

        if adata is self.prot:
            if gene_col in adata.var.columns:
                for acc, gene in zip(adata.var_names, adata.var[gene_col]):
                    if notna(gene):
                        gene = str(gene)
                        forward[gene] = acc
                        reverse[acc] = gene

        elif adata is self.pep:
            prot_acc_col = utils.get_pep_prot_mapping(self)
            pep_to_prot = adata.var[prot_acc_col]
            for pep, prot in zip(adata.var_names, pep_to_prot):
                if notna(prot):
                    forward[prot] = pep
                    reverse[pep] = prot

        return forward, reverse

    impute = impute
    normalize = normalize
    _normalize_helper = _normalize_helper

pdata = DummyPAnnData(adata)


# impute checks

In [5]:
pdata.impute(method='mean', set_X = False)
pdata.prot.layers["X_impute_mean"]

ℹ️ Global imputation using 'mean'. Layer saved as 'X_impute_mean'.
✅ 8 values imputed.


array([[1.000e+00, 2.625e+02, 1.000e+01, 1.000e+02],
       [2.000e+00, 2.000e+01, 1.010e+03, 2.000e+02],
       [7.575e+01, 3.000e+01, 3.000e+01, 5.750e+02],
       [1.000e+02, 2.625e+02, 1.000e+03, 5.000e+02],
       [2.000e+02, 4.000e+02, 1.010e+03, 5.750e+02],
       [7.575e+01, 6.000e+02, 3.000e+03, 1.500e+03]])

In [6]:
pdata.impute(classes=["cellline", "treatment"], method="mean", set_X = False)
print("\n✅ Imputed matrix:")
print(np.round(pdata.prot.layers['X_impute_mean'], 1))


ℹ️ Group-wise imputation using 'mean' on class(es): ['cellline', 'treatment']. Layer saved as 'X_impute_mean'.
✅ 8 values imputed total.
   - AS, sc: 4 values
   - BE, kd: 4 values

✅ Imputed matrix:
[[1.0e+00 2.5e+01 1.0e+01 1.0e+02]
 [2.0e+00 2.0e+01 2.0e+01 2.0e+02]
 [1.5e+00 3.0e+01 3.0e+01 1.5e+02]
 [1.0e+02 5.0e+02 1.0e+03 5.0e+02]
 [2.0e+02 4.0e+02 2.0e+03 1.0e+03]
 [1.5e+02 6.0e+02 3.0e+03 1.5e+03]]


In [7]:
pdata.impute(classes=["cellline", "treatment"], method="min", set_X = False)
print("\n✅ Imputed matrix:")
print(np.round(pdata.prot.layers['X_impute_min'], 1))


ℹ️ Group-wise imputation using 'min' on class(es): ['cellline', 'treatment']. Layer saved as 'X_impute_min'.
✅ 8 values imputed total.
   - AS, sc: 4 values
   - BE, kd: 4 values

✅ Imputed matrix:
[[1.0e+00 2.0e+01 1.0e+01 1.0e+02]
 [2.0e+00 2.0e+01 1.0e+01 2.0e+02]
 [1.0e+00 3.0e+01 3.0e+01 1.0e+02]
 [1.0e+02 4.0e+02 1.0e+03 5.0e+02]
 [2.0e+02 4.0e+02 1.0e+03 5.0e+02]
 [1.0e+02 6.0e+02 3.0e+03 1.5e+03]]


In [8]:
pdata.impute(method='min', set_X = False)
pdata.prot.layers["X_impute_min"]

ℹ️ Global imputation using 'min'. Layer saved as 'X_impute_min'.
✅ 8 values imputed.


array([[1.0e+00, 2.0e+01, 1.0e+01, 1.0e+02],
       [2.0e+00, 2.0e+01, 1.0e+01, 2.0e+02],
       [1.0e+00, 3.0e+01, 3.0e+01, 1.0e+02],
       [1.0e+02, 2.0e+01, 1.0e+03, 5.0e+02],
       [2.0e+02, 4.0e+02, 1.0e+01, 1.0e+02],
       [1.0e+00, 6.0e+02, 3.0e+03, 1.5e+03]])

In [10]:
pdata.impute(method='knn', set_X = False, n_neighbors=2)
pdata.prot.layers["X_impute_knn"]

ℹ️ Global imputation using 'knn'. Layer saved as 'X_impute_knn'.
✅ 8 values imputed.


array([[1.00e+00, 2.50e+01, 1.00e+01, 1.00e+02],
       [2.00e+00, 2.00e+01, 2.00e+01, 2.00e+02],
       [1.50e+00, 3.00e+01, 3.00e+01, 1.50e+02],
       [1.00e+02, 2.10e+02, 1.00e+03, 5.00e+02],
       [2.00e+02, 4.00e+02, 5.05e+02, 3.00e+02],
       [1.01e+02, 6.00e+02, 3.00e+03, 1.50e+03]])

# normalize checks

In [88]:
pdata.prot.var

Unnamed: 0,Genes
P1,GAPDH
P2,ACTB
P3,TUBB
P4,MYH9
P5,HSP90
P6,RPLP0


In [80]:
# Test global normalization by 'sum'
pdata.normalize(method='sum', classes=None, on='protein', set_X=False, use_nonmissing=True)

# Inspect result
np.set_printoptions(suppress=True, precision=2)
layer_name = "X_norm_sum"
print("\nNormalized matrix:")
print(pdata.prot.layers[layer_name])

# Check that all row sums are now equal
row_sums = np.nansum(pdata.prot.layers[layer_name], axis=1)
print("\nRow sums after normalization:", row_sums)
print("Are all row sums equal?", np.allclose(row_sums, row_sums[0]))


ℹ️ Normalizing using only fully observed columns: [4 5]
ℹ️ Global normalization using 'sum' (using only fully observed columns). Layer saved as 'X_norm_sum'.
✅ Normalized all 6 samples.

Normalized matrix:
[[   1.       nan   10.02  100.2   501.      2.  ]
 [   2.     20.02     nan  200.2   500.5     2.5 ]
 [    nan   30.     30.       nan  500.      3.  ]
 [ 100.04     nan 1000.4   500.2   500.2     2.8 ]
 [ 200.32  400.64     nan     nan  500.8     2.2 ]
 [    nan  601.08 3005.38 1502.69  500.9     2.1 ]]

Row sums after normalization: [ 614.22  725.22  563.   2103.64 1103.96 5612.14]
Are all row sums equal? False


In [81]:
# Test global normalization by 'sum'
pdata.normalize(method='sum', classes=None, on='protein', set_X=False, use_nonmissing=False)

# Inspect result
np.set_printoptions(suppress=True, precision=5)
layer_name = "X_norm_sum"
print("\nNormalized matrix:")
print(pdata.prot.layers[layer_name])

# Check that all row sums are now equal
row_sums = np.nansum(pdata.prot.layers[layer_name], axis=1)
print("\nRow sums after normalization:", row_sums)
print("Are all row sums equal?", np.allclose(row_sums, row_sums[0]))


ℹ️ Global normalization using 'sum'. Layer saved as 'X_norm_sum'.
✅ Normalized all 6 samples.

Normalized matrix:
[[   9.13883        nan   91.38825  913.88254 4569.41272   18.27765]
 [  15.46473  154.64734        nan 1546.47343 3866.18357   19.33092]
 [       nan  298.51332  298.51332        nan 4975.22202   29.85133]
 [ 266.41145        nan 2664.11451 1332.05726 1332.05726    7.45952]
 [1016.53058 2033.06115        nan        nan 2541.32644   11.18184]
 [       nan  600.      3000.      1500.       500.         2.1    ]]

Row sums after normalization: [5602.1 5602.1 5602.1 5602.1 5602.1 5602.1]
Are all row sums equal? True


In [77]:
# Reset to original data first
pdata.prot.X = X.copy()

# Test group-wise normalization by 'mean'
pdata.normalize(method='mean', classes='cellline', on='protein',set_X=False, use_nonmissing=True)

# Inspect result
print("\nNormalized (group-wise mean) matrix:")
print(pdata.prot.layers["X_norm_mean"])

# Check that row means within each group are equal
df = pd.DataFrame(pdata.prot.layers["X_norm_mean"])
df['group'] = pdata.prot.obs['cellline'].values
means_by_group = df.groupby('group').apply(lambda g: np.nanmean(g.drop(columns='group').values, axis=1))
print("\nRow means by group:")
print(means_by_group)

# Check that all row means within each group are equal
print("\nAre all row means within each group equal?")


ℹ️ Normalizing using only fully observed columns: [4 5]
ℹ️ Normalizing using only fully observed columns: [4 5]
ℹ️ Group-wise normalization using 'mean' on class(es): cellline (using only fully observed columns). Layer saved as 'X_norm_mean'.
✅ Normalized 6 samples total.
   - AS: 3 samples normalized
   - BE: 3 samples normalized

Normalized (group-wise mean) matrix:
[[   1.00199        nan   10.01992  100.1992   500.99602    2.00398]
 [   2.00199   20.0199         nan  200.199    500.49751    2.50249]
 [       nan   30.        30.             nan  500.         3.     ]
 [ 100.             nan 1000.       500.       500.         2.8    ]
 [ 200.23895  400.4779         nan        nan  500.59737    2.20263]
 [       nan  600.83649 3004.18243 1502.09122  500.69707    2.10293]]

Row means by group:
group
AS    [420.56000000000006, 275.8792114695341, 1121.9...
BE      [122.84422310756972, 145.0441791044776, 140.75]
dtype: object

Are all row means within each group equal?


In [58]:
# Reset to original data first
pdata.prot.X = X.copy()

# Test group-wise normalization by 'median'
pdata.normalize(method='median', classes='cellline', on='protein',set_X=False)

# Inspect result
print("\nNormalized (group-wise median) matrix:")
print(pdata.prot.layers["X_norm_median"])

# Check that row means within each group are equal
df = pd.DataFrame(pdata.prot.layers["X_norm_median"])
df['group'] = pdata.prot.obs['cellline'].values
median_by_group = df.groupby('group').apply(lambda g: np.nanmedian(g.drop(columns='group').values, axis=1))
print("\nRow median by group:")
print(median_by_group)

# Check that all row median within each group are equal
print("\nAre all row median within each group equal?")


[500. 300. 600.]
[1.2 2.  1. ]
[10. 20. 30.]
[3.  1.5 1. ]
ℹ️ Group-wise normalization using 'median' on class(es): cellline. Layer saved as 'X_norm_median'.
✅ Normalized 6 samples total.
   - AS: 3 samples normalized
   - BE: 3 samples normalized

Normalized (group-wise median) matrix:
[[   3.       nan   30.    300.   1500.      6.  ]
 [   3.     30.       nan  300.    750.      3.75]
 [    nan   30.     30.       nan  500.      3.  ]
 [ 120.       nan 1200.    600.    600.      3.36]
 [ 400.    800.       nan     nan 1000.      4.4 ]
 [    nan  600.   3000.   1500.    500.      2.1 ]]

Row median by group:
group
AS    [600.0, 600.0, 600.0]
BE       [30.0, 30.0, 30.0]
dtype: object

Are all row median within each group equal?


In [96]:
pdata = DummyPAnnData(adata)

# Test reference feature normalization using gene names
pdata.normalize(
    method='reference_feature',
    reference_columns=['GAPDH', 'ACTB'],
    reference_method='mean',
    set_X=False
)

# Inspect result
print("\nNormalized matrix using reference feature:")
print(pdata.prot.layers["X_norm_reference_feature"])
# Check that reference columns median across all rows
df = pd.DataFrame(pdata.prot.layers["X_norm_reference_feature"])
df['group'] = pdata.prot.obs['cellline'].values
median_by_group = df.groupby('group').apply(lambda g: np.nanmedian(g.drop(columns='group').values, axis=1))
print("\nRow median by group:")
print(median_by_group)



ℹ️ Normalizing using found reference columns: ['P1', 'P2']
ℹ️ Global normalization using 'reference_feature'. Layer saved as 'X_norm_reference_feature'.
✅ Normalized all 6 samples.

Normalized matrix using reference feature:
[[   200.         nan   2000.    20000.   100000.      400.  ]
 [   130.     1300.         nan  13000.    32500.      162.5 ]
 [      nan    600.      600.         nan  10000.       60.  ]
 [   200.         nan   2000.     1000.     1000.        5.6 ]
 [   250.      500.         nan       nan    625.        2.75]
 [      nan    600.     3000.     1500.      500.        2.1 ]]

Row median by group:
group
AS     [1000.0, 375.0, 600.0]
BE    [2000.0, 1300.0, 600.0]
dtype: object
