In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import scanpy as sc
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue

# Read data

In [None]:
# Load the filtered peak-by-barcode matrix. gex_only=False asks scanpy not to
# restrict the features to gene-expression entries, which this peak matrix needs.
adata = sc.read_10x_h5(
    "../download/10x-ATAC-Brain5k/atac_v1_adult_brain_fresh_5k_filtered_peak_bc_matrix.h5",
    gex_only=False,
)
# Name the two axes so downstream tables are self-describing.
adata.obs.index.name = "cells"
adata.var.index.name = "peaks"
adata

In [None]:
# Per-cell identity table, indexed by the first CSV column. The "Idents" column
# is renamed to `cell_type`, and rows belonging to the listed cell types are
# excluded from the metadata.
obs_meta = (
    pd.read_csv("../download/10x-ATAC-Brain5k/signac_idents.csv", index_col=0)
    .rename(columns={"Idents": "cell_type"})
    .query("cell_type not in ['Astro', 'Endo', 'Macrophage', 'Oligo', 'Meis2', 'VLMC']")
)

# Process metadata

In [None]:
# Attach the cell-type annotation (index-aligned join; cells missing from
# `obs_meta` end up with NaN) and tag every cell with constant provenance columns.
adata.obs = adata.obs.join(obs_meta).assign(
    domain="scATAC-seq",
    protocol="10x ATAC",
    dataset="10x-ATAC-Brain5k",
)
adata.obs.head()

In [None]:
# Peak identifiers in `gene_ids` are expected to have the form "chrom:start-end".
# Parse each identifier once with a single anchored regex instead of three
# element-wise lambdas: the greedy `chrom` group tolerates contig names that
# themselves contain ":" or "-", and the pandas string accessor also copes with
# an empty peak table.
peak_coords = adata.var["gene_ids"].str.extract(
    r"^(?P<chrom>.+):(?P<chromStart>\d+)-(?P<chromEnd>\d+)$"
)

# Fail loudly on identifiers that do not match, rather than storing NaN coordinates.
unparsed_peaks = peak_coords.isna().any(axis=1)
if unparsed_peaks.any():
    raise ValueError(
        "Cannot parse peak coordinates from: "
        + ", ".join(map(str, adata.var.loc[unparsed_peaks, "gene_ids"].head()))
    )

# `.to_numpy()` assigns positionally, so no index alignment is involved.
adata.var["chrom"] = peak_coords["chrom"].to_numpy()
adata.var["chromStart"] = peak_coords["chromStart"].astype(int).to_numpy()
adata.var["chromEnd"] = peak_coords["chromEnd"].astype(int).to_numpy()

# The raw identifier is redundant once it has been split into its components.
del adata.var["gene_ids"]
adata.var.head()

# Clean data

In [None]:
# Keep only cells that received a `cell_type` annotation. The NaN check is
# restricted to that column so that a missing value in any other obs column
# cannot silently discard additional cells.
retained_cells = adata.obs.dropna(subset=["cell_type"]).index
adata = adata[retained_cells, :]
adata

In [None]:
# Wrap the peak table as a Bed object, adding a `name` column that carries the
# peak identifiers.
peak_bed = scglue.genomics.Bed(adata.var.assign(name=adata.var_names))

# Graph linking peaks to intervals from the mm10 blacklist file.
# NOTE(review): window_size=0 presumably limits links to direct overlaps —
# confirm against the scglue documentation.
blacklist_overlap = scglue.genomics.window_graph(
    peak_bed,
    "../genome/Blacklist/lists/mm10-blacklist.v2.bed.gz",
    window_size=0,
)

In [None]:
# Biadjacency matrix of the overlap graph with one row per peak, in
# `adata.var_names` order.
peak_blacklist_links = biadjacency_matrix(blacklist_overlap, adata.var_names)

# Row sums flattened to a 1-D array; a zero sum means the peak has no link in
# the overlap graph, so it is kept.
blacklist_hits = np.asarray(peak_blacklist_links.sum(axis=1)).ravel()
retained_peaks = blacklist_hits == 0

adata = adata[:, retained_peaks]
# Make sure the coordinate columns are stored as integers.
adata.var = adata.var.astype({"chromStart": int, "chromEnd": int})
adata

In [None]:
# Remove features (peaks here) whose total count is below 1; the filter is
# applied to `adata` in place.
sc.pp.filter_genes(adata, min_counts=1, inplace=True)
adata

# Save data

In [None]:
# Write the cleaned dataset to disk as a gzip-compressed h5ad file.
adata.write_h5ad(
    "../dataset/10x-ATAC-Brain5k.h5ad",
    compression="gzip",
)