In [None]:
import anndata
import numpy as np
import pandas as pd
import networkx as nx
import scanpy as sc
import scipy.sparse
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue

# Read data

In [None]:
# Load the filtered feature-barcode matrix. `gex_only=False` is required here
# because both "Gene Expression" and "Peaks" features are split out of it below.
adata = sc.read_10x_h5(
    "../download/10x-Multiome-Pbmc10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5",
    gex_only=False,
)
adata

In [None]:
# Select the gene-expression features and copy them into an independent object.
is_gene_expression = adata.var["feature_types"] == "Gene Expression"
rna = adata[:, is_gene_expression].copy()
# Name both axes so downstream tables are self-describing.
rna.obs.index.name = "cells"
rna.var.index.name = "genes"
rna

In [None]:
# Select the chromatin-accessibility peak features and copy them into an independent object.
is_peak = adata.var["feature_types"] == "Peaks"
atac = adata[:, is_peak].copy()
# Name both axes so downstream tables are self-describing.
atac.obs.index.name = "cells"
atac.var.index.name = "peaks"
atac

In [None]:
# Build an AnnData object from the fragments bedmap file via scglue; its
# variables are labelled as genes below and later aligned to `rna.var_names`.
frags2rna = scglue.data.bedmap2anndata(
    "../download/10x-Multiome-Pbmc10k/pbmc_granulocyte_sorted_10k_atac_fragments.bedmap.gz"
)
frags2rna.obs.index.name = "cells"
frags2rna.var.index.name = "genes"
frags2rna

# Process meta

In [None]:
# Attach constant provenance columns to every RNA cell.
rna.obs = rna.obs.assign(
    domain="scRNA-seq",
    protocol="10x Multiome",
    dataset="10x-Multiome-Pbmc10k-RNA",
)

In [None]:
# Make gene names unique before annotating.
rna.var_names_make_unique()

# Annotate genes from the GENCODE GTF: `rna.var["gene_ids"]` is matched against
# the GTF `gene_id` attribute, with `ens_trim_version` applied via `by_func`
# (presumably to strip Ensembl version suffixes before matching -- confirm in scglue docs).
scglue.data.get_gene_annotation(
    rna,
    var_by="gene_ids",
    gtf="../genome/gencode.v35.chr_patch_hapl_scaff.annotation.gtf.gz",
    gtf_by="gene_id",
    by_func=scglue.genomics.ens_trim_version,
)
rna.var.head()

In [None]:
# Attach constant provenance columns to every ATAC cell.
atac.obs = atac.obs.assign(
    domain="scATAC-seq",
    protocol="10x Multiome",
    dataset="10x-Multiome-Pbmc10k-ATAC",
)

In [None]:
# Parse peak IDs of the form "<chrom>:<start>-<end>" into BED-style columns.
#
# A single anchored regex is used for all three fields so they are always parsed
# consistently: the coordinates are the trailing "<digits>-<digits>" after the
# last ":". Splitting the whole ID on "-" (as was done for chromEnd before)
# returns the wrong token whenever the contig name itself contains a hyphen.
peak_ids = atac.var["gene_ids"]
peak_coords = peak_ids.str.extract(
    r"^(?P<chrom>.+):(?P<chromStart>\d+)-(?P<chromEnd>\d+)$"
)

# Fail loudly with the offending IDs instead of a bare IndexError/ValueError.
malformed = peak_coords.isna().any(axis=1).to_numpy()
if malformed.any():
    raise ValueError(
        "Cannot parse peak coordinates from: "
        + ", ".join(map(str, peak_ids.to_numpy()[malformed][:5]))
    )

# Assign plain arrays (not Series) so values are set positionally, as before.
atac.var["chrom"] = peak_coords["chrom"].to_numpy()
atac.var["chromStart"] = peak_coords["chromStart"].astype(int).to_numpy()
atac.var["chromEnd"] = peak_coords["chromEnd"].astype(int).to_numpy()

# The raw ID column is no longer needed once it has been split.
del atac.var["gene_ids"]
atac.var.head()

In [None]:
# Attach constant provenance columns to every cell of the fragment-derived matrix.
frags2rna.obs = frags2rna.obs.assign(
    domain="scATAC-seq",
    protocol="10x Multiome",
    dataset="10x-Multiome-Pbmc10k-FRAGS2RNA",
)

# Pair samples & add cell types

In [None]:
# Read the per-cell metadata table (first CSV column becomes the index) and
# rename `celltype` to `cell_type`.
meta = pd.read_csv(
    "../download/10x-Multiome-Pbmc10k/wnn_meta_data.csv",
    index_col=0,
).rename(columns={"celltype": "cell_type"})

In [None]:
# Left-join the metadata columns onto each object's obs table (matched on index).
rna.obs, atac.obs, frags2rna.obs = (
    modality.obs.join(meta) for modality in (rna, atac, frags2rna)
)

In [None]:
# Restrict all three objects to the cells listed in `meta`, in `meta`'s order,
# so that their rows line up positionally from here on.
rna, atac, frags2rna = (
    modality[meta.index, :] for modality in (rna, atac, frags2rna)
)

# Clean data

## Remove doublets

In [None]:
# Read the per-cell doublet calls and order them like `rna`'s cells.
doubletfinder = pd.read_csv(
    "../download/10x-Multiome-Pbmc10k/doubletfinder_inference.csv",
    index_col=0,
).loc[rna.obs_names, "doubletfinder"]

# Boolean array, True for cells called "Singlet"; applied positionally below.
mask = doubletfinder.eq("Singlet").to_numpy()

In [None]:
# Keep only the cells flagged as singlets (row-wise boolean selection).
rna = rna[mask]
rna

In [None]:
# Keep only the cells flagged as singlets (row-wise boolean selection).
atac = atac[mask]
atac

In [None]:
# Keep only the cells flagged as singlets (row-wise boolean selection).
frags2rna = frags2rna[mask]
frags2rna

## Filter features

In [None]:
# Keep only genes for which all three genomic coordinate fields are present.
has_coordinates = rna.var[["chrom", "chromStart", "chromEnd"]].notna().all(axis=1)
retained_genes = rna.var.loc[has_coordinates].index
rna = rna[:, retained_genes]

# With the missing values gone, the coordinates can be stored as integers.
rna.var = rna.var.astype({"chromStart": int, "chromEnd": int})
rna

In [None]:
# Drop genes with fewer than one count in total; `rna` is modified in place.
sc.pp.filter_genes(rna, min_counts=1, inplace=True)
rna

In [None]:
# Wrap the peak table as a scglue Bed object, using the peak names as the `name` column.
atac_peaks_bed = scglue.genomics.Bed(atac.var.assign(name=atac.var_names))

# Graph connecting peaks to hg38 blacklist regions; `window_size=0` means no
# extension around the intervals (presumably direct overlap only -- confirm in scglue docs).
blacklist_overlap = scglue.genomics.window_graph(
    atac_peaks_bed,
    "../genome/Blacklist/lists/hg38-blacklist.v2.bed.gz",
    window_size=0,
)

In [None]:
# Biadjacency matrix of the overlap graph with one row per peak, in `atac.var_names` order.
peak_blacklist_adjacency = biadjacency_matrix(blacklist_overlap, atac.var_names)

# Row sums give each peak's total edge weight; a sum of zero means the peak is
# connected to nothing in the graph and is therefore retained.
blacklist_hits = np.asarray(peak_blacklist_adjacency.sum(axis=1)).ravel()
retained_peaks = blacklist_hits == 0

atac = atac[:, retained_peaks]
atac.var = atac.var.astype({"chromStart": int, "chromEnd": int})
atac

In [None]:
# Drop peaks with fewer than one count in total; `atac` is modified in place.
# (`filter_genes` acts on the variable axis, which holds peaks here.)
sc.pp.filter_genes(atac, min_counts=1, inplace=True)
atac

In [None]:
# Genes present in `rna` but absent from `frags2rna`.
missing_vars = list(set(rna.var_names) - set(frags2rna.var_names))

# All-zero sparse block for the missing genes, sharing `frags2rna`'s cells.
zero_padding = anndata.AnnData(
    X=scipy.sparse.csr_matrix((frags2rna.shape[0], len(missing_vars))),
    obs=pd.DataFrame(index=frags2rna.obs_names),
    var=pd.DataFrame(index=missing_vars),
)

# Append the padding along the variable axis; merge="first" takes each
# obs-side annotation from the first object that provides it.
frags2rna = anndata.concat([frags2rna, zero_padding], axis=1, merge="first")

# Keep the same features as RNA, in the same order.
frags2rna = frags2rna[:, rna.var_names].copy()
frags2rna

# Process data

In [None]:
# Flag the top 2000 highly variable genes using the "seurat_v3" flavor.
sc.pp.highly_variable_genes(
    rna,
    flavor="seurat_v3",
    n_top_genes=2000,
)
# Count of genes flagged as highly variable.
rna.var["highly_variable"].sum()

# Save data

In [None]:
# Write each processed object to its own gzip-compressed .h5ad file.
rna.write_h5ad(
    "../dataset/10x-Multiome-Pbmc10k-RNA.h5ad",
    compression="gzip",
)
atac.write_h5ad(
    "../dataset/10x-Multiome-Pbmc10k-ATAC.h5ad",
    compression="gzip",
)
frags2rna.write_h5ad(
    "../dataset/10x-Multiome-Pbmc10k-FRAGS2RNA.h5ad",
    compression="gzip",
)