In [1]:
!pip install mygene




In [6]:
import scanpy as sc
import pandas as pd
import numpy as np
import mygene
import subprocess, sys
from pathlib import Path


def collapse_duplicate_gene_ids(adata, gid_key="ensembl_id", how="sum"):
    """
    Collapse columns whose `gid_key` (default: Ensembl ID) appears >1 ×.
    Aggregation method:
        how="sum"   – add counts   (recommended for raw UMI matrices)
        how="mean"  – mean counts
        how="first" – keep first column verbatim
    """
    gid = adata.var[gid_key].values
    dup_mask = pd.Series(gid).duplicated(keep=False).to_numpy()
    if not dup_mask.any():
        print("✓ All Ensembl IDs already unique.")
        return adata
    
    print(f"⚠️  Collapsing {dup_mask.sum()} duplicated columns…")
    
    _, inv, counts = np.unique(gid, return_inverse=True, return_counts=True)
    groups = {g: np.where(inv == g)[0] for g in np.where(counts > 1)[0]}
    
    X = adata.X.tocsr()
    keep = np.ones(adata.n_vars, dtype=bool)
    
    for idxs in groups.values():
        keep[idxs[1:]] = False              # drop all but the first
        if how == "sum":
            X[:, idxs[0]] = X[:, idxs].sum(axis=1)
        elif how == "mean":
            X[:, idxs[0]] = X[:, idxs].mean(axis=1)
        # "first": leave X[:, idxs[0]] as-is
    
    adata._inplace_subset_var(keep)         # updates .X and .var in one go
    print(f"✓ Collapsed to {adata.n_vars} unique genes.")
    return adata


In [7]:
IN_H5AD  = Path("updated_adata_marked_with_doublets.h5ad")   # input file
OUT_H5AD = Path("updated_adata_unique_ensembl.h5ad")                 # will be written

In [8]:
adata = sc.read_h5ad(IN_H5AD)
adata


AnnData object with n_obs × n_vars = 74453 × 26147
    obs: 'sample_id', 'region', 'brain_region', 'batch', 'donor_id', 'braak_stage', 'sex', 'age', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'log1p_total_counts_mt', 'n_genes', 'solo_doublet', 'solo_singlet'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_mean_counts', 'log1p_total_counts', 'n_cells'

In [9]:
mg          = mygene.MyGeneInfo()
symbols     = adata.var["gene_ids"].astype(str).tolist()

hits = mg.querymany(
    symbols,
    scopes="symbol,alias,ensembl.gene",
    fields="ensembl.gene",
    species="human",
    as_dataframe=True,
    verbose=False
)

ensembl_map = (hits["ensembl.gene"]
               .dropna()
               .explode()
               .astype(str)
               .groupby(level=0).first())        # keep first match per symbol

adata.var["ensembl_id"] = adata.var["gene_ids"].map(ensembl_map) \
                                            .fillna(adata.var["gene_ids"])
print("↳  Example mapping:")
adata.var[["gene_ids", "ensembl_id"]].head()


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


↳  Example mapping:


Unnamed: 0,gene_ids,ensembl_id
RP11-34P13.7,ENSG00000238009,ENSG00000238009
FO538757.2,ENSG00000279457,ENSG00000279457
AP006222.2,ENSG00000228463,ENSG00000228463
RP4-669L17.10,ENSG00000237094,ENSG00000237094
RP5-857K21.4,ENSG00000230021,ENSG00000230021


In [10]:
adata = collapse_duplicate_gene_ids(adata, gid_key="ensembl_id", how="sum")

# Make the official gene column the now-unique Ensembl IDs
adata.var["gene_ids"] = adata.var["ensembl_id"]
adata.var.drop(columns=["ensembl_id"], inplace=True)


✓ All Ensembl IDs already unique.


In [11]:
adata.write(OUT_H5AD)
print(f"✔️  Saved cleaned file → {OUT_H5AD}")


✔️  Saved cleaned file → updated_adata_unique_ensembl.h5ad


In [None]:
result_path, has_warnings = validate_h5ad(
    input_path=str(OUT_H5AD),
    output_path=None,          # write next to input; change if you want
    log_level="INFO"
)
print("Validator wrote:", result_path)
print("Had warnings?  ", has_warnings)

In [12]:
import scanpy as sc, pandas as pd, numpy as np
from mygene import MyGeneInfo        # pip install mygene

adata = sc.read_h5ad(IN_H5AD)

# 1. Map your current var.index → Ensembl IDs
mg   = MyGeneInfo()
conv = mg.querymany(adata.var_names.tolist(),
                    scopes="symbol,ensemblgene,entrezgene",
                    fields="ensembl.gene",
                    species="human", as_dataframe=True)

# Build a mapping dict {old_id → ensembl_id}
id_map = (conv["ensembl.gene"]
          .dropna()
          .apply(lambda x: x[0] if isinstance(x, list) else x)
          .to_dict())
adata.var["ensembl_id"] = adata.var_names.map(id_map)

# 2. Keep genes we could map and make them unique
adata = adata[:, adata.var["ensembl_id"].notna()].copy()
dupes = adata.var["ensembl_id"].duplicated(keep=False)

# optional: collapse duplicates by summing counts
for gid in adata.var.loc[dupes, "ensembl_id"].unique():
    cols = np.where(adata.var["ensembl_id"] == gid)[0]
    newX = adata.X[:, cols].sum(axis=1)
    adata.X[:, cols[0]] = newX          # put sum in first column
    adata.X = adata.X[:, ~np.isin(range(adata.n_vars), cols[1:])]  # drop others

# 3. Finalise
adata.var_names    = adata.var["ensembl_id"]      # enforce uniqueness
adata.var.index    = adata.var_names
adata.write_h5ad("for_MapMyCells.h5ad", compression="gzip")


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
414 input query terms found dup hits:	[('LINC01346', 2), ('LINC01141', 2), ('LINC01355', 3), ('LINC01226', 2), ('NFYC-AS1', 2), ('KDM4A-AS
8491 input query terms found no hit:	['RP11-34P13.7', 'FO538757.2', 'AP006222.2', 'RP4-669L17.10', 'RP5-857K21.4', 'RP5-857K21.2', 'RP11-


In [21]:
import scanpy as sc, pandas as pd, numpy as np
adata = sc.read_h5ad("updated_adata_marked_with_doublets.h5ad")

In [8]:
adata

AnnData object with n_obs × n_vars = 74453 × 26147
    obs: 'sample_id', 'region', 'brain_region', 'batch', 'donor_id', 'braak_stage', 'sex', 'age', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'log1p_total_counts_mt', 'n_genes', 'solo_doublet', 'solo_singlet'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_mean_counts', 'log1p_total_counts', 'n_cells'

In [22]:
adata.obs

Unnamed: 0,sample_id,region,brain_region,batch,donor_id,braak_stage,sex,age,n_genes_by_counts,total_counts,...,log1p_n_genes_by_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,log1p_total_counts_mt,n_genes,solo_doublet,solo_singlet
GSM4432645_GSM4432645_AAACCTGAGGATGCGT-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,4620,11301.0,...,8.438366,9.332735,19.653128,25.546412,32.952836,46.243695,6.746412,4620,True,0.199106
GSM4432645_GSM4432645_AAACCTGAGTCAATAG-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,2562,4558.0,...,7.848934,8.424859,19.197016,25.756911,34.510750,49.846424,2.890372,2562,False,0.289714
GSM4432645_GSM4432645_AAACCTGCAAGTAGTA-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,484,634.0,...,6.184149,6.453625,29.179811,39.432177,55.205047,100.000000,3.761200,484,True,0.158051
GSM4432645_GSM4432645_AAACCTGCAGACGTAG-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,244,305.0,...,5.501258,5.723585,36.393443,52.786885,85.573770,100.000000,3.091043,244,False,0.975610
GSM4432645_GSM4432645_AAACCTGCATACTCTT-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,3068,6162.0,...,8.029433,8.726481,18.822002,25.799124,34.252799,50.024339,5.267858,3069,True,0.235072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM4432654_GSM4432654_TTTGTCATCCACGTGG-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,292,363.0,...,5.680173,5.897154,33.333333,47.107438,74.655647,100.000000,1.791759,292,False,0.994819
GSM4432654_GSM4432654_TTTGTCATCGCGGATC-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,498,698.0,...,6.212606,6.549651,33.524355,42.979943,57.306590,100.000000,1.386294,498,False,0.830773
GSM4432654_GSM4432654_TTTGTCATCTATCGCC-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,392,443.0,...,5.976351,6.098074,22.747748,34.009009,56.531532,100.000000,1.945910,393,False,0.838007
GSM4432654_GSM4432654_TTTGTCATCTCGCTTG-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,470,600.0,...,6.154858,6.398595,27.166667,38.333333,55.000000,100.000000,3.713572,470,False,0.773198


In [11]:
adata.var

Unnamed: 0,gene_ids,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,log1p_mean_counts,log1p_total_counts,n_cells
RP11-34P13.7,ENSG00000238009,False,336,0.004064,99.598389,340.0,0.000104,6.047372,336
FO538757.2,ENSG00000279457,False,8310,0.118416,90.067294,9907.0,0.004658,9.846230,8310
AP006222.2,ENSG00000228463,False,1789,0.022722,97.861659,1901.0,0.000816,8.102889,1789
RP4-669L17.10,ENSG00000237094,False,712,0.008881,99.148967,743.0,0.000245,6.899723,712
RP5-857K21.4,ENSG00000230021,False,13117,0.226731,84.321624,18969.0,0.006836,10.230955,13117
...,...,...,...,...,...,...,...,...,...
AC007325.2,ENSG00000277196,False,531,0.006586,99.365311,551.0,0.000175,6.561031,531
BX072566.1,ENSG00000277630,False,14,0.000167,99.983266,14.0,0.000004,2.772589,14
AL354822.1,ENSG00000278384,False,349,0.004339,99.582850,363.0,0.000112,6.115892,349
AC004556.1,ENSG00000276345,False,18,0.000215,99.978485,18.0,0.000020,4.418840,18


In [23]:
adata = adata[~adata.obs["solo_doublet"]].copy()


In [24]:
adata

AnnData object with n_obs × n_vars = 64086 × 26147
    obs: 'sample_id', 'region', 'brain_region', 'batch', 'donor_id', 'braak_stage', 'sex', 'age', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'log1p_total_counts_mt', 'n_genes', 'solo_doublet', 'solo_singlet'
    var: 'gene_ids', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_mean_counts', 'log1p_total_counts', 'n_cells'

In [19]:
# ------------------------------------------------------------
# Clean an .h5ad file so Map My Cells accepts it
# ------------------------------------------------------------
#  1) maps every gene symbol/alias ➜ canonical Ensembl ID
#  2) keeps only genes that could be mapped
#  3) drops any duplicate Ensembl IDs (keeps the first one)
#  4) writes a fresh .h5ad file with unique Ensembl IDs
#
#  Works for human or mouse data.
# ------------------------------------------------------------
#  ❗ Adjust only the three parameters just below
# ------------------------------------------------------------
in_file   = "up_no_doublets.h5ad"   # ← your current file
out_file  = "up_clean_unique.h5ad"  # ← file to upload
species   = "human"                 # "human" or "mouse"
# ------------------------------------------------------------

# If you haven’t already, install prerequisites right here:
# !pip install scanpy mygene --quiet

import scanpy as sc
import pandas as pd
from mygene import MyGeneInfo

print(f"▶ loading {in_file}")


# ------------------------------------------------------------------
# 1. Build a mapping: gene_symbol/alias → canonical Ensembl gene ID
# ------------------------------------------------------------------
mg = MyGeneInfo()
symbol_list = adata.var_names.tolist()
sym2ens = {}

# mygene can take up to 1 000 queries at once, so chunk the list
for start in range(0, len(symbol_list), 1000):
    chunk = symbol_list[start : start + 1000]
    for hit in mg.querymany(chunk,
                            scopes=["symbol", "alias", "name"],
                            fields="ensembl.gene",
                            species=species,
                            as_dataframe=False):
        if hit.get("notfound"):
            continue
        ens_field = hit["ensembl"]
        if isinstance(ens_field, list):
            ens_field = ens_field[0]
        ens_id = ens_field["gene"].split(".")[0]      # strip version suffix
        sym2ens[hit["query"]] = ens_id

print(f"  ↳ mapped {len(sym2ens):,} of {adata.n_vars:,} genes")

# ------------------------------------------------------------------
# 2. Keep only successfully-mapped genes
# ------------------------------------------------------------------
keep_mask = adata.var_names.isin(sym2ens)
adata = adata[:, keep_mask].copy()
adata.var["ensembl_id"] = adata.var_names.map(sym2ens)

# ------------------------------------------------------------------
# 3. Drop duplicate Ensembl IDs (keep the first)
# ------------------------------------------------------------------
dup_mask = adata.var["ensembl_id"].duplicated(keep="first")
n_dups   = dup_mask.sum()
if n_dups:
    print(f"  ↳ found {n_dups:,} duplicate Ensembl IDs — dropping extras")
    adata = adata[:, ~dup_mask].copy()

# Set the Ensembl IDs as the final, unique var index
adata.var_names = adata.var["ensembl_id"]
adata.var_names_make_unique()

# ------------------------------------------------------------------
# 4. Save
# ------------------------------------------------------------------
adata.write(out_file)
print(f"✔ cleaned file written to {out_file}")


Input sequence provided is already in string format. No operation performed


▶ loading up_no_doublets.h5ad


258 input query terms found dup hits:	[('TTLL10', 2), ('DVL1', 2), ('MRPL20', 4), ('ANKRD65', 3), ('SSU72', 10), ('SLC35E2', 2), ('GNB1', 


KeyError: 'ensembl'

In [25]:
adata.var

Unnamed: 0,gene_ids,mt,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,log1p_mean_counts,log1p_total_counts,n_cells
RP11-34P13.7,ENSG00000238009,False,336,0.004064,99.598389,340.0,0.000104,6.047372,336
FO538757.2,ENSG00000279457,False,8310,0.118416,90.067294,9907.0,0.004658,9.846230,8310
AP006222.2,ENSG00000228463,False,1789,0.022722,97.861659,1901.0,0.000816,8.102889,1789
RP4-669L17.10,ENSG00000237094,False,712,0.008881,99.148967,743.0,0.000245,6.899723,712
RP5-857K21.4,ENSG00000230021,False,13117,0.226731,84.321624,18969.0,0.006836,10.230955,13117
...,...,...,...,...,...,...,...,...,...
AC007325.2,ENSG00000277196,False,531,0.006586,99.365311,551.0,0.000175,6.561031,531
BX072566.1,ENSG00000277630,False,14,0.000167,99.983266,14.0,0.000004,2.772589,14
AL354822.1,ENSG00000278384,False,349,0.004339,99.582850,363.0,0.000112,6.115892,349
AC004556.1,ENSG00000276345,False,18,0.000215,99.978485,18.0,0.000020,4.418840,18


In [26]:
adata.obs

Unnamed: 0,sample_id,region,brain_region,batch,donor_id,braak_stage,sex,age,n_genes_by_counts,total_counts,...,log1p_n_genes_by_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,log1p_total_counts_mt,n_genes,solo_doublet,solo_singlet
GSM4432645_GSM4432645_AAACCTGAGTCAATAG-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,2562,4558.0,...,7.848934,8.424859,19.197016,25.756911,34.510750,49.846424,2.890372,2562,False,0.289714
GSM4432645_GSM4432645_AAACCTGCAGACGTAG-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,244,305.0,...,5.501258,5.723585,36.393443,52.786885,85.573770,100.000000,3.091043,244,False,0.975610
GSM4432645_GSM4432645_AAACCTGGTGAGTATA-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,229,271.0,...,5.438079,5.605802,33.948339,52.398524,89.298893,100.000000,2.944439,229,False,0.990093
GSM4432645_GSM4432645_AAACCTGGTTATCACG-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,317,372.0,...,5.762051,5.921578,28.225806,41.666667,68.548387,100.000000,2.079442,317,False,0.987308
GSM4432645_GSM4432645_AAACCTGTCAAGCCTA-1_GSM4432645,GSM4432645,Entorhinal cortex,Entorhinal cortex,C,2,0,Male,60,376,479.0,...,5.932245,6.173786,31.941545,42.379958,63.256785,100.000000,2.639057,376,False,0.434747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM4432654_GSM4432654_TTTGTCATCCACGTGG-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,292,363.0,...,5.680173,5.897154,33.333333,47.107438,74.655647,100.000000,1.791759,292,False,0.994819
GSM4432654_GSM4432654_TTTGTCATCGCGGATC-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,498,698.0,...,6.212606,6.549651,33.524355,42.979943,57.306590,100.000000,1.386294,498,False,0.830773
GSM4432654_GSM4432654_TTTGTCATCTATCGCC-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,392,443.0,...,5.976351,6.098074,22.747748,34.009009,56.531532,100.000000,1.945910,393,False,0.838007
GSM4432654_GSM4432654_TTTGTCATCTCGCTTG-1_GSM4432654,GSM4432654,Entorhinal cortex,Entorhinal cortex,D,10,6,Male,82,470,600.0,...,6.154858,6.398595,27.166667,38.333333,55.000000,100.000000,3.713572,470,False,0.773198


In [27]:
# --- Move Ensembl IDs into the index ---------------------------------
adata.var["gene_symbol"] = adata.var_names      # keep symbols as a column
adata.var_names = adata.var["gene_ids"]         # set index = Ensembl IDs
adata.var_names_make_unique()                   # be 100 % sure they’re unique

# --- Sanity checks ----------------------------------------------------
assert adata.var_names.is_unique
assert adata.var_names.str.startswith("ENS").all()

# (optional) if you no longer need the old column
#adata.var.drop(columns=["gene_ids"], inplace=True)

# --- Save -------------------------------------------------------------
adata.write("ready_for_mapmycells.h5ad")
print("✅ File saved: ready_for_mapmycells.h5ad — upload this to Map My Cells.")


✅ File saved: ready_for_mapmycells.h5ad — upload this to Map My Cells.
