In [1]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from scviz import utils
from scviz import setup

# A = [[7,2,3],[4,np.nan,6],[10,5,np.nan],[np.nan,np.nan,2]]
# print(A)
# print("\n\n")
# column_trans = ColumnTransformer(
# [('imp_col1', SimpleImputer(strategy='mean'), [1]),
#  ('imp_col2', SimpleImputer(strategy='constant', fill_value=29), [2,3])],
# remainder='passthrough')

# print(column_trans.fit_transform(A)[:, [2,0,1]])

In [None]:
def impute(self, classes=None, layer="X", method='mean', on='protein', set_X=True, **kwargs):
    """
    Impute missing (NaN) values feature-by-feature, either across all samples or
    separately within each sample group.

    The imputed matrix is stored in ``.layers["X_impute_<method>"]`` of the selected
    table (sparse if the input was sparse), a summary of the number of filled values
    is printed, and an entry is appended to ``self._history``.

    Parameters:
        classes (str or list): Class columns in .obs to group by. ``None`` (default)
            imputes globally across all samples.
        layer (str): Data layer to impute from; "X" reads ``.X``.
        method (str): 'mean', 'median', 'min', or 'knn'. 'knn' is only available
            for global imputation.
        on (str): 'protein' or 'peptide'.
        set_X (bool): Whether to set .X to the imputed result.
        **kwargs: ``n_neighbors`` (default 3) is passed to KNNImputer for method='knn'.

    Returns:
        None. Returns early without doing anything when ``self._check_data(on)`` is falsy.

    Raises:
        ValueError: If ``layer`` does not exist, ``method`` is unsupported, or
            'knn' is requested together with ``classes``.

    Notes:
        A feature with no observed value among the rows being imputed (the whole
        matrix, or one group) is filled with 0 for every method. The sklearn
        imputers are created with ``keep_empty_features=True`` for this; by default
        they drop such columns, which changes the matrix shape and breaks the
        write-back into the group rows. This option requires scikit-learn >= 1.2.
    """
    import warnings
    from scipy import sparse

    if not self._check_data(on):
        return

    adata = self.prot if on == 'protein' else self.pep
    if layer != "X" and layer not in adata.layers:
        raise ValueError(f"Layer '{layer}' not found in .{on}.")

    # Validate the request before any data is copied.
    if method not in {"mean", "median", "min", "knn"}:
        raise ValueError(f"Unsupported method: {method}")
    if classes is not None and method == 'knn':
        raise ValueError("KNN imputation is not supported for group-wise imputation.")

    source = adata.layers[layer] if layer != "X" else adata.X
    was_sparse = sparse.issparse(source)
    # Always work on a private dense copy so the source layer is never modified.
    impute_data = source.toarray() if was_sparse else source.copy()
    # Only the positions of the original NaNs are needed for the summary.
    missing_before = np.isnan(impute_data)

    layer_name = f"X_impute_{method}"

    def _fill(block):
        """Fill NaNs in `block` column-wise; the result always has block's shape.

        For 'min' the block is modified in place (callers pass private copies).
        """
        if method == 'min':
            # nanmin warns and returns NaN for an all-NaN column; that case is
            # handled explicitly by the 0 fallback below, so the warning is noise.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                col_min = np.nanmin(block, axis=0)
            col_min = np.where(np.isnan(col_min), 0, col_min)
            gaps = np.isnan(block)
            block[gaps] = np.take(col_min, np.where(gaps)[1])
            return block

        # Imported lazily: only the sklearn-backed strategies need it.
        from sklearn.impute import SimpleImputer, KNNImputer

        if method == 'knn':
            imputer = KNNImputer(
                n_neighbors=kwargs.get('n_neighbors', 3),
                keep_empty_features=True,
            )
        else:
            imputer = SimpleImputer(strategy=method, keep_empty_features=True)
        return imputer.fit_transform(block)

    if classes is None:
        # Global imputation
        group_rows = None
        impute_data = _fill(impute_data)
        print(f"ℹ️ Global imputation using '{method}'. Layer saved as '{layer_name}'.")
    else:
        # Group-wise imputation
        from scviz import utils

        sample_names = np.array(utils.get_samplenames(adata, classes))
        # Row indices per group, computed once and reused for the summary below.
        group_rows = {
            group: np.where(sample_names == group)[0]
            for group in np.unique(sample_names)
        }
        for rows in group_rows.values():
            # Fancy indexing yields a copy, so _fill never aliases impute_data here.
            impute_data[rows, :] = _fill(impute_data[rows, :])
        print(f"ℹ️ Group-wise imputation using '{method}' on class(es): {classes}. Layer saved as '{layer_name}'.")

    # A value counts as imputed if it was NaN before and is not NaN now.
    newly_filled = missing_before & ~np.isnan(impute_data)
    if group_rows is None:
        summary_lines = [f"✅ {np.sum(newly_filled)} values imputed."]
    else:
        counts_by_group = {
            group: np.sum(newly_filled[rows, :]) for group, rows in group_rows.items()
        }
        total = sum(counts_by_group.values())
        summary_lines = [f"✅ {total} values imputed total."]
        summary_lines.extend(
            f"   - {group}: {count} values" for group, count in counts_by_group.items()
        )

    print("\n".join(summary_lines))

    adata.layers[layer_name] = sparse.csr_matrix(impute_data) if was_sparse else impute_data

    if set_X:
        self.set_X(layer=layer_name, on=on)

    self._history.append(
        f"{on}: Imputed layer '{layer}' using '{method}' (grouped by {classes if classes else 'ALL'}). Stored in '{layer_name}'."
    )


In [3]:
# Report the installed scikit-learn version: raw string first, parsed form second
from packaging.version import parse as parse_version
from sklearn import __version__ as sklearn_version

print(sklearn_version, parse_version(sklearn_version), sep="\n")

1.4.2
1.4.2


In [4]:
import numpy as np
import pandas as pd
from anndata import AnnData
from scipy import sparse

# ---- STEP 1: Build the toy dataset ----
# Rows are the 6 samples (obs), columns the 4 proteins (var); NaN = missing value
X = np.array(
    [
        [1.0,    np.nan, 10.0,   100.0],    # BE_kd1
        [2.0,    20.0,   np.nan, 200.0],    # BE_kd2
        [np.nan, 30.0,   30.0,   np.nan],   # BE_kd3
        [100.0,  np.nan, 1000.0, 500.0],    # AS_sc1
        [200.0,  400.0,  np.nan, np.nan],   # AS_sc2
        [np.nan, 600.0,  3000.0, 1500.0],   # AS_sc3
    ]
)

# Sample annotations: first three samples are BE/kd, last three are AS/sc
obs = pd.DataFrame(
    {
        "cellline": ["BE"] * 3 + ["AS"] * 3,
        "treatment": ["kd"] * 3 + ["sc"] * 3,
    },
    index=[f"sample{n}" for n in range(1, 7)],
)

# Protein names P1..P4 as the (column-less) var table
var = pd.DataFrame(index=[f"P{n}" for n in range(1, 5)])

# Bundle the toy matrix and its annotations into an AnnData object
adata = AnnData(X=X, obs=obs, var=var)

# ---- STEP 2: Minimal stand-in for pAnnData ----
class DummyPAnnData:
    """Bare-bones pAnnData substitute providing only what `impute` uses."""

    def __init__(self, adata):
        self.prot = adata      # protein-level table
        self.pep = None        # no peptide-level table in this toy setup
        self._history = []     # log entries appended by `impute`

    def _check_data(self, on):
        """Return True only for on='protein' when a protein table is present."""
        if on != 'protein':
            return False
        return self.prot is not None

    def set_X(self, layer="X", on="protein"):
        """Replace .X of the selected table with a copy of the named layer."""
        if on == "protein":
            target = self.prot
        else:
            target = self.pep
        target.X = target.layers[layer].copy()

    # Attach the notebook-level `impute` function as a method
    impute = impute

pdata = DummyPAnnData(adata)


In [5]:
# Global mean imputation; set_X=False means .X is not replaced by the result
pdata.impute(method="mean", set_X=False)
pdata.prot.layers["X_impute_mean"]

ℹ️ Global imputation using 'mean'. Layer saved as 'X_impute_mean'.
✅ 8 values imputed.


array([[1.000e+00, 2.625e+02, 1.000e+01, 1.000e+02],
       [2.000e+00, 2.000e+01, 1.010e+03, 2.000e+02],
       [7.575e+01, 3.000e+01, 3.000e+01, 5.750e+02],
       [1.000e+02, 2.625e+02, 1.000e+03, 5.000e+02],
       [2.000e+02, 4.000e+02, 1.010e+03, 5.750e+02],
       [7.575e+01, 6.000e+02, 3.000e+03, 1.500e+03]])

In [6]:
# Mean imputation within each cellline/treatment group
pdata.impute(classes=["cellline", "treatment"], method="mean", set_X=False)
print("\n✅ Imputed matrix:", np.round(pdata.prot.layers["X_impute_mean"], 1), sep="\n")


ℹ️ Group-wise imputation using 'mean' on class(es): ['cellline', 'treatment']. Layer saved as 'X_impute_mean'.
✅ 8 values imputed total.
   - AS, sc: 4 values
   - BE, kd: 4 values

✅ Imputed matrix:
[[1.0e+00 2.5e+01 1.0e+01 1.0e+02]
 [2.0e+00 2.0e+01 2.0e+01 2.0e+02]
 [1.5e+00 3.0e+01 3.0e+01 1.5e+02]
 [1.0e+02 5.0e+02 1.0e+03 5.0e+02]
 [2.0e+02 4.0e+02 2.0e+03 1.0e+03]
 [1.5e+02 6.0e+02 3.0e+03 1.5e+03]]


In [7]:
# Minimum-value imputation within each cellline/treatment group
pdata.impute(classes=["cellline", "treatment"], method="min", set_X=False)
print("\n✅ Imputed matrix:", np.round(pdata.prot.layers["X_impute_min"], 1), sep="\n")


ℹ️ Group-wise imputation using 'min' on class(es): ['cellline', 'treatment']. Layer saved as 'X_impute_min'.
✅ 8 values imputed total.
   - AS, sc: 4 values
   - BE, kd: 4 values

✅ Imputed matrix:
[[1.0e+00 2.0e+01 1.0e+01 1.0e+02]
 [2.0e+00 2.0e+01 1.0e+01 2.0e+02]
 [1.0e+00 3.0e+01 3.0e+01 1.0e+02]
 [1.0e+02 4.0e+02 1.0e+03 5.0e+02]
 [2.0e+02 4.0e+02 1.0e+03 5.0e+02]
 [1.0e+02 6.0e+02 3.0e+03 1.5e+03]]


In [8]:
# Global minimum-value imputation (overwrites the group-wise 'X_impute_min' layer)
pdata.impute(method="min", set_X=False)
pdata.prot.layers["X_impute_min"]

ℹ️ Global imputation using 'min'. Layer saved as 'X_impute_min'.
✅ 8 values imputed.


array([[1.0e+00, 2.0e+01, 1.0e+01, 1.0e+02],
       [2.0e+00, 2.0e+01, 1.0e+01, 2.0e+02],
       [1.0e+00, 3.0e+01, 3.0e+01, 1.0e+02],
       [1.0e+02, 2.0e+01, 1.0e+03, 5.0e+02],
       [2.0e+02, 4.0e+02, 1.0e+01, 1.0e+02],
       [1.0e+00, 6.0e+02, 3.0e+03, 1.5e+03]])

In [10]:
# Global KNN imputation; n_neighbors is forwarded to the KNN imputer via **kwargs
pdata.impute(method="knn", set_X=False, n_neighbors=2)
pdata.prot.layers["X_impute_knn"]

ℹ️ Global imputation using 'knn'. Layer saved as 'X_impute_knn'.
✅ 8 values imputed.


array([[1.00e+00, 2.50e+01, 1.00e+01, 1.00e+02],
       [2.00e+00, 2.00e+01, 2.00e+01, 2.00e+02],
       [1.50e+00, 3.00e+01, 3.00e+01, 1.50e+02],
       [1.00e+02, 2.10e+02, 1.00e+03, 5.00e+02],
       [2.00e+02, 4.00e+02, 5.05e+02, 3.00e+02],
       [1.01e+02, 6.00e+02, 3.00e+03, 1.50e+03]])

In [6]:
from sklearn.impute import SimpleImputer
import numpy as np

# Start from a copy of the matrix currently held in .X
X = pdata.prot.X.copy()

# One label per sample, built as "<cellline>_<treatment>"
sample_names = (pdata.prot.obs["cellline"] + "_" + pdata.prot.obs["treatment"]).values

unique_groups = np.unique(sample_names)
print(f"Found {len(unique_groups)} unique groups: {unique_groups}")

# Results go into a separate matrix so X itself stays as copied
X_imputed = X.copy()

for group in unique_groups:
    # Row positions of the samples belonging to this group
    idx = np.flatnonzero(sample_names == group)
    print(f"\n🟦 Group: {group} (samples: {idx})")

    # Sub-matrix of shape (n_samples_in_group, n_proteins)
    group_data = X[idx, :]
    print(f" - Group shape: {group_data.shape}")

    # Column-wise mean fill within the group ('median' works the same way)
    imputer = SimpleImputer(strategy="mean")
    imputed_group = imputer.fit_transform(group_data)

    # Write the filled rows back at their original positions
    X_imputed[idx, :] = imputed_group

print("\n✅ Final imputed matrix:")
print(np.round(X_imputed, 1))


Found 2 unique groups: ['AS_sc' 'BE_kd']

🟦 Group: AS_sc (samples: [3 4 5])
 - Group shape: (3, 4)

🟦 Group: BE_kd (samples: [0 1 2])
 - Group shape: (3, 4)

✅ Final imputed matrix:
[[1.0e+00 2.5e+01 1.0e+01 1.0e+02]
 [2.0e+00 2.0e+01 2.0e+01 2.0e+02]
 [1.5e+00 3.0e+01 3.0e+01 1.5e+02]
 [1.0e+02 5.0e+02 1.0e+03 5.0e+02]
 [2.0e+02 4.0e+02 2.0e+03 1.0e+03]
 [1.5e+02 6.0e+02 3.0e+03 1.5e+03]]


In [22]:
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Stand-alone test matrix: 6 samples (rows) x 4 proteins (columns), NaN = missing
X = np.array(
    [
        [1.0,    np.nan, 10.0,   100.0],    # BE_kd1
        [2.0,    20.0,   np.nan, 200.0],    # BE_kd2
        [np.nan, 30.0,   30.0,   np.nan],   # BE_kd3
        [100.0,  np.nan, 1000.0, 500.0],    # AS_sc1
        [200.0,  400.0,  np.nan, np.nan],   # AS_sc2
        [np.nan, 600.0,  3000.0, 1500.0],   # AS_sc3
    ]
)

obs = pd.DataFrame(
    {
        "cellline": ["BE"] * 3 + ["AS"] * 3,
        "treatment": ["kd"] * 3 + ["sc"] * 3,
    },
    index=[f"sample{n}" for n in range(1, 7)],
)

# Group label per sample: "<cellline>_<treatment>", groups in order of appearance
sample_names = obs["cellline"] + "_" + obs["treatment"]
unique_groups = sample_names.unique()

X_imputed = X.copy()

for group in unique_groups:
    # Row positions of this group's samples
    idx = np.where(sample_names == group)[0]
    # Sub-matrix of shape (n_samples, n_proteins)
    group_data = X[idx, :]

    # Fill each protein's NaNs with its mean over the group's samples
    imputer = SimpleImputer(strategy="mean")
    imputed_group = imputer.fit_transform(group_data)

    X_imputed[idx, :] = imputed_group

print("\n✅ Final imputed matrix:")
print(np.round(X_imputed, 1))



✅ Final imputed matrix:
[[1.0e+00 2.5e+01 1.0e+01 1.0e+02]
 [2.0e+00 2.0e+01 2.0e+01 2.0e+02]
 [1.5e+00 3.0e+01 3.0e+01 1.5e+02]
 [1.0e+02 5.0e+02 1.0e+03 5.0e+02]
 [2.0e+02 4.0e+02 2.0e+03 1.0e+03]
 [1.5e+02 6.0e+02 3.0e+03 1.5e+03]]
