In [1]:
import anndata as ad
import scanpy as sc
import glob

In [2]:
files = glob.glob("/mnt/hdd2/tan/competition_support_set_sorted/*.h5")

In [5]:
from __future__ import annotations
from collections import Counter, defaultdict
from typing import Iterable, Optional, List, Dict, Tuple
import gc
import numpy as np
import scanpy as sc
import anndata as ad

def compute_consensus_hvgs(
    h5ad_paths: Iterable[str],
    *,
    n_top_genes_per_dataset: int = 4000,
    min_genes: int = 4000,
    layer: Optional[str] = None,
    flavor: str = "seurat",
    gene_key: Optional[str] = None,
    gene_transform: Optional[callable] = None,
    verbose: bool = True,
    gc_every_iter: bool = True,   # NEW: force garbage collection after each file
) -> List[str]:
    counts: Counter[str] = Counter()
    score_lists: Dict[str, List[float]] = defaultdict(list)
    all_seen_genes: set[str] = set()

    def _pick_score_col(cols: Iterable[str]) -> Optional[str]:
        for c in ("dispersions_norm", "variances_norm", "dispersions", "variances"):
            if c in cols:
                return c
        return None

    paths = list(h5ad_paths)  # avoid re-materializing an iterator repeatedly
    for i, path in enumerate(paths):
        if verbose:
            print(f"[{i+1}/{len(paths)}] reading {path}")

        adata = None
        hvg_df = None
        hvgs = None
        scores = None

        try:
            adata: ad.AnnData = sc.read_h5ad(path)

            if gene_key is not None:
                if gene_key not in adata.var.columns:
                    raise KeyError(f"{path}: gene_key '{gene_key}' not found in .var")
                adata.var_names = adata.var[gene_key].astype(str)
                adata.var_names_make_unique()

            if gene_transform is not None:
                new_names = [gene_transform(g) for g in adata.var_names]
                adata.var_names = sc.utils.make_index_unique(np.array(new_names, dtype=object))

            k = min(n_top_genes_per_dataset, adata.n_vars)
            hvg_df = sc.pp.highly_variable_genes(
                adata,
                flavor=flavor,
                n_top_genes=k,
                layer=layer,
                inplace=False,
            )
            score_col = _pick_score_col(hvg_df.columns)

            hvgs = hvg_df.index[hvg_df["highly_variable"]].tolist()
            counts.update(hvgs)

            if score_col is not None:
                scores = hvg_df[score_col].to_dict()
                for g, s in scores.items():
                    if np.isfinite(s):
                        score_lists[g].append(float(s))

            all_seen_genes.update(hvg_df.index.tolist())

        finally:
            # Proactively drop large objects and trigger GC
            try:
                if adata is not None and getattr(adata, "isbacked", False):
                    # Close memmap/file handle if ever used in backed mode
                    adata.file.close()
            except Exception:
                pass

            del hvgs
            del hvg_df
            del scores
            del adata

            if gc_every_iter:
                gc.collect()

    # Build consensus (≥ 2 datasets), then top-up to min_genes
    consensus = [g for g, c in counts.items() if c >= 2]

    if len(consensus) < min_genes:
        ranked = sorted(
            all_seen_genes,
            key=lambda g: (
                counts[g],
                max(score_lists.get(g, [float("-inf")])),
            ),
            reverse=True,
        )
        chosen = set(consensus)
        for g in ranked:
            if len(chosen) >= min_genes:
                break
            if g not in chosen:
                chosen.add(g)
        consensus = list(chosen)

    def sort_key(g: str):
        freq = counts[g]
        best = max(score_lists.get(g, [float("-inf")]))
        return (-freq, -best, g)

    consensus_sorted = sorted(consensus, key=sort_key)
    if verbose:
        print(f"Consensus HVGs: {len(consensus_sorted)} genes "
              f"(>=2 datasets: {sum(counts[g] >= 2 for g in consensus_sorted)})")
    return consensus_sorted


In [6]:
hvg = compute_consensus_hvgs(files)

[1/14] reading /mnt/hdd2/tan/competition_support_set_sorted/jiang_bxpc3.h5
[2/14] reading /mnt/hdd2/tan/competition_support_set_sorted/mcfaline_a172.h5
[3/14] reading /mnt/hdd2/tan/competition_support_set_sorted/mcfaline_t98g.h5
[4/14] reading /mnt/hdd2/tan/competition_support_set_sorted/mcfaline_u87mg.h5
[5/14] reading /mnt/hdd2/tan/competition_support_set_sorted/jiang_a549.h5
[6/14] reading /mnt/hdd2/tan/competition_support_set_sorted/jurkat.h5
[7/14] reading /mnt/hdd2/tan/competition_support_set_sorted/jiang_ht29.h5
[8/14] reading /mnt/hdd2/tan/competition_support_set_sorted/jiang_hap1.h5
[9/14] reading /mnt/hdd2/tan/competition_support_set_sorted/hepg2.h5
[10/14] reading /mnt/hdd2/tan/competition_support_set_sorted/competition_train.h5
[11/14] reading /mnt/hdd2/tan/competition_support_set_sorted/jiang_mcf7.h5
[12/14] reading /mnt/hdd2/tan/competition_support_set_sorted/k562_gwps.h5
[13/14] reading /mnt/hdd2/tan/competition_support_set_sorted/k562.h5
[14/14] reading /mnt/hdd2/tan/co

In [8]:
len(hvg)

11431

In [9]:
with open("hvg-all-competition-extended.txt", "w") as f:
    for g in hvg:
        f.write(g + "\n")

In [10]:
with open("hvg-4000-competition-extended.txt", "w") as f:
    for g in hvg[:4000]:
        f.write(g + "\n")