In [None]:
import gzip
import pathlib
import pickle
import anndata
import networkx as nx
import numpy as np
import pandas as pd
import scanpy as sc
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue

# Read data

In [None]:
# Load every gzip-compressed pickle in the download directory and wrap each
# payload in its own AnnData object (one list entry per input file).
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only run this cell on files obtained from a trusted source.
input_files = sorted(pathlib.Path("../download/Domcke-2020").glob("*.pkl.gz"))
if not input_files:
    # Fail here with a clear message rather than later, when the empty list
    # would be unpacked into AnnData.concatenate.
    raise FileNotFoundError(
        "No *.pkl.gz files found in ../download/Domcke-2020"
    )

adata = []
for fname in input_files:
    print(f"Processing {fname}...")
    with gzip.open(fname, "rb") as f:
        d = pickle.load(f)
    # The file handle is no longer needed once the payload is in memory.
    # Each payload is read through the keys "X", "obs" and "var_names".
    adata.append(anndata.AnnData(
        X=d["X"], obs=d["obs"],
        var=pd.DataFrame(index=d["var_names"])
    ))

In [None]:
# Merge the per-file AnnData objects into one; the originating file is
# recorded in the "file" obs column and cell names are left unsuffixed
# (index_unique=None).
# Note: this rebinds `adata` from the list built above to the merged object,
# so the loading cell must be re-run before this cell can be executed again.
adata = anndata.AnnData.concatenate(
    *adata,
    batch_key="file",
    index_unique=None,
)
adata.obs.index.name = "cells"
adata.var.index.name = "peaks"
adata

# Process metadata

In [None]:
# Attach the same dataset-level annotations to every cell.
for column, value in (
    ("domain", "scATAC-seq"),
    ("protocol", "sci-ATAC-seq3"),
    ("dataset", "Domcke-2020"),
):
    adata.obs[column] = value
adata.obs.head()

In [None]:
# Peak names are parsed as "<chrom>-<start>-<end>". Split each name once and
# derive all three coordinate columns from that single parse; building the
# columns from plain lists also works when there are no peaks at all.
peak_fields = [peak.split("-") for peak in adata.var_names]
adata.var["chrom"] = [fields[0] for fields in peak_fields]
adata.var["chromStart"] = np.array(
    [int(fields[1]) for fields in peak_fields], dtype=int
)
adata.var["chromEnd"] = np.array(
    [int(fields[2]) for fields in peak_fields], dtype=int
)
adata.var["genome"] = "hg19"
adata.var.head()

# Clean data

In [None]:
# Represent the peaks as BED records, using each peak's var name as the
# record name so graph nodes can be matched back to adata.var_names.
peak_bed = scglue.genomics.Bed(adata.var.assign(name=adata.var_names))
blacklist_path = "../genome/Blacklist/lists/hg19-blacklist.v2.bed.gz"

# Graph connecting peaks to blacklist regions.
# NOTE(review): window_size=0 presumably restricts edges to direct overlaps;
# confirm against the scglue documentation.
blacklist_overlap = scglue.genomics.window_graph(
    peak_bed, blacklist_path, window_size=0
)

In [None]:
# For each peak (rows ordered as adata.var_names), sum its row of the
# peak-by-blacklist biadjacency matrix; a row sum of zero means the peak has
# no edge in the overlap graph and is therefore kept.
overlap_counts = biadjacency_matrix(
    blacklist_overlap, adata.var_names
).sum(axis=1)
retained_peaks = np.asarray(overlap_counts).ravel() == 0

# Subsetting an AnnData returns a view; copy explicitly so the `.var`
# reassignment below (and later in-place filtering) acts on a standalone
# object instead of relying on an implicit view-to-copy conversion.
adata = adata[:, retained_peaks].copy()
adata.var = adata.var.astype({"chromStart": int, "chromEnd": int})
adata

In [None]:
# Apply scanpy's gene filter with min_counts=1 to the features of `adata`
# (the features here are peaks; the var index is named "peaks" above).
# The return value is not captured, so this relies on scanpy filtering
# `adata` in place.
sc.pp.filter_genes(adata, min_counts=1)
adata

# Save data

In [None]:
# Persist the cleaned object as a gzip-compressed .h5ad file.
output_path = "../dataset/Domcke-2020.h5ad"
adata.write_h5ad(output_path, compression="gzip")