In [None]:
import warnings

import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp

# ---- CONFIG ----
# Name of the layer in `ad_raw` that holds the raw counts. Change it if the
# layer is named differently; when `ad_raw` has no layer with this name the
# alignment code below falls back to `ad_raw.X`.
RAW_LAYER_NAME = "raw"

# ---- 1) Build a robust, unique matching key on both objects ----
def build_match_key(ann, sample_col="grid_label"):
    """Write a unique per-cell ``match_key`` column into ``ann.obs``.

    The key starts from a per-cell identifier: the ``barcode`` column of
    ``ann.obs`` if present, otherwise ``CellID``, otherwise ``ann.obs_names``.
    When ``sample_col`` is a column of ``ann.obs`` the key becomes
    ``"<sample>|<identifier>"`` so identical identifiers from different
    samples stay distinct.

    Repeated keys are made unique in row order: the first occurrence is left
    untouched and later ones get ``.1``, ``.2``, ... appended. A suffix that
    would clash with any other key already present is skipped, so the
    resulting column is always unique.

    Caveat: suffixes depend only on the row order inside ``ann``. Two objects
    that hold the duplicated cells in a different order (or hold different
    subsets of them) will get different suffixes for the same cell, so
    matching duplicated keys across objects is not reliable. A warning is
    emitted whenever suffixes had to be added.

    Parameters
    ----------
    ann : AnnData-like
        Object exposing ``obs`` (a DataFrame) and ``obs_names``. Modified in
        place: ``ann.obs["match_key"]`` is created or overwritten.
    sample_col : str, default "grid_label"
        Name of the ``obs`` column used as sample prefix; ignored when the
        column does not exist.

    Returns
    -------
    str
        Name of the column that was written, always ``"match_key"``.
    """
    obs = ann.obs

    # Prefer a stable barcode column if present, else use obs_names
    if "barcode" in obs:
        base = obs["barcode"].astype(str)
    elif "CellID" in obs:
        base = obs["CellID"].astype(str)
    else:
        base = pd.Series(ann.obs_names, index=ann.obs_names, name="barcode").astype(str)

    # If the sample column exists, prefix it to disambiguate across samples
    if sample_col in obs:
        key = obs[sample_col].astype(str) + "|" + base
    else:
        key = base

    # From here on work positionally on plain strings (obs_names may repeat).
    labels = key.astype(str).tolist()

    n_repeated = len(labels) - len(set(labels))
    if n_repeated > 0:
        warnings.warn(
            f"build_match_key: {n_repeated} repeated key(s) were disambiguated with "
            "'.N' suffixes assigned in row order; matching these cells against "
            "another object is only correct if both list them in the same order.",
            stacklevel=2,
        )
        taken = set(labels)   # every original key is reserved up front
        seen = set()          # original keys whose first occurrence was emitted
        next_suffix = {}      # per key: next suffix number to try
        unique_labels = []
        for label in labels:
            if label not in seen:
                seen.add(label)
                unique_labels.append(label)
                continue
            n = next_suffix.get(label, 1)
            # Skip suffixes that collide with an existing or generated key.
            while f"{label}.{n}" in taken:
                n += 1
            candidate = f"{label}.{n}"
            taken.add(candidate)
            next_suffix[label] = n + 1
            unique_labels.append(candidate)
        labels = unique_labels

    ann.obs["match_key"] = pd.Series(labels, index=ann.obs_names, name="match_key")
    # Keep AnnData index valid (can stay non-unique), matching will use match_key
    return "match_key"




In [None]:
# Project root; ends with '/' so relative paths can be appended directly.
base_data_dir = '/Users/christoffer/work/karolinska/development/RRMap/'
# Processed object that will receive the raw-counts layer.
adata = sc.read_h5ad(base_data_dir + 'data/TingTest_RREAE_5k_integration_scaled_clustered_annotated_2_5_anno.h5ad')
# Object read from the "raw_integration" file; used as the source of the counts.
ad_raw = sc.read_h5ad(base_data_dir + 'data/RREAE_5k_raw_integration.h5ad')

In [None]:
# ---- 1) Tag both objects with the shared "match_key" obs column ----
mk_adata = build_match_key(adata, sample_col="grid_label")
mk_raw = build_match_key(ad_raw, sample_col="grid_label")

# ---- 2) Every adata key has to be present in ad_raw; abort early otherwise ----
keys_adata = set(adata.obs[mk_adata])
keys_raw = set(ad_raw.obs[mk_raw])
missing = keys_adata - keys_raw
if len(missing) > 0:
    # A handful of offending keys is enough to start debugging.
    ex = list(missing)[:10]
    raise ValueError(
        f"{len(missing)} cells in adata not found in ad_raw by match_key. "
        f"First few: {ex}. Check that sample_id/barcodes align."
    )

# ---- 3) Reorder the raw-count rows so they follow adata's cell order ----
# raw_pos[i] is the row of ad_raw that carries the key of adata cell i.
raw_pos = pd.Index(ad_raw.obs[mk_raw]).get_indexer(adata.obs[mk_adata])
assert not (raw_pos < 0).any(), "Unexpected negative positions; keys mismatch."

# Source matrix: the configured layer when it exists, otherwise ad_raw.X.
if RAW_LAYER_NAME in ad_raw.layers:
    Xsrc = ad_raw.layers[RAW_LAYER_NAME]
else:
    Xsrc = ad_raw.X

# Sparse input is converted to CSR; anything else goes through np.asarray.
if sp.issparse(Xsrc):
    Xsrc = Xsrc.tocsr()
else:
    Xsrc = np.asarray(Xsrc)

# Pick the rows (cells) in adata order.
X_rows = Xsrc[raw_pos, :]

# ---- 4) Reorder the columns (genes) so they follow adata.var_names ----
if not ad_raw.var_names.equals(adata.var_names):
    # Each adata gene must be found in ad_raw; -1 marks a gene that is not.
    col_idx = ad_raw.var_names.get_indexer(adata.var_names)
    if (col_idx < 0).any():
        missing_genes = list(adata.var_names[col_idx < 0][:10])
        raise ValueError(f"Some adata genes not in ad_raw: {missing_genes} ...")
    X_rows = X_rows[:, col_idx]

# ---- 5) Attach the aligned matrix to adata as the "raw" layer ----
if sp.issparse(X_rows):
    adata.layers["raw"] = X_rows
else:
    adata.layers["raw"] = np.asarray(X_rows)
print(
    f"✅ adata.layers['raw'] set. Shape={adata.layers['raw'].shape}, "
    f"type={'sparse' if sp.issparse(adata.layers['raw']) else 'ndarray'}"
)

# (Optional) Make the AnnData index itself unique to avoid future warnings:
# adata.obs_names_make_unique()

In [None]:
# Optional: uncomment to write `adata` to disk.
# adata.write('../data/RREAE_5k_raw_integration_processed.h5ad')