In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import scipy.io
import anndata
import scanpy as sc
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue

# scRNA-seq

## Read data

In [None]:
# Load the SNARE-seq cDNA barcode and gene tables; both files are read without
# a header row and their first column becomes the index.
rna_obs = pd.read_csv(
    "../download/Chen-2019/GSE126074_AdBrainCortex_SNAREseq_cDNA.barcodes.tsv.gz",
    header=None,
    index_col=0,
)
rna_var = pd.read_csv(
    "../download/Chen-2019/GSE126074_AdBrainCortex_SNAREseq_cDNA.genes.tsv.gz",
    header=None,
    index_col=0,
)
rna_obs.index.name = "cells"
rna_var.index.name = "genes"

# Read the Matrix Market counts, transpose them and store the result as CSR.
# NOTE(review): the transpose presumably turns a genes x cells file into a
# cells x genes matrix matching obs/var - confirm against the file layout.
rna_matrix = scipy.io.mmread(
    "../download/Chen-2019/GSE126074_AdBrainCortex_SNAREseq_cDNA.counts.mtx.gz"
).T.tocsr()

rna = anndata.AnnData(X=rna_matrix, obs=rna_obs, var=rna_var)
rna

## Process meta

In [None]:
# Attach constant provenance columns to every RNA cell.
for column, label in {
    "domain": "scRNA-seq",
    "protocol": "SNARE-seq",
    "dataset": "Chen-2019-RNA",
}.items():
    rna.obs[column] = label

In [None]:
# Annotate the genes of `rna` from the GTF file, matching on the gene name.
# NOTE(review): presumably this fills `rna.var` with genomic coordinates
# (chrom / chromStart / chromEnd are read from it later) - confirm in scglue docs.
scglue.data.get_gene_annotation(
    rna,
    gtf="../genome/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz",
    gtf_by="gene_name",
)
# Constant genome-build label for every gene.
rna.var["genome"] = "mm10"

# scATAC-seq

## Read data

In [None]:
# Load the SNARE-seq chromatin barcode and peak tables; both files are read
# without a header row and their first column becomes the index.
atac_obs = pd.read_csv(
    "../download/Chen-2019/GSE126074_AdBrainCortex_SNAREseq_chromatin.barcodes.tsv.gz",
    header=None,
    index_col=0,
)
atac_var = pd.read_csv(
    "../download/Chen-2019/GSE126074_AdBrainCortex_SNAREseq_chromatin.peaks.tsv.gz",
    header=None,
    index_col=0,
)
atac_obs.index.name = "cells"
atac_var.index.name = "peaks"

# Read the Matrix Market counts, transpose them and store the result as CSR.
# NOTE(review): the transpose presumably turns a peaks x cells file into a
# cells x peaks matrix matching obs/var - confirm against the file layout.
atac_matrix = scipy.io.mmread(
    "../download/Chen-2019/GSE126074_AdBrainCortex_SNAREseq_chromatin.counts.mtx.gz"
).T.tocsr()

atac = anndata.AnnData(X=atac_matrix, obs=atac_obs, var=atac_var)
atac

## Process meta

In [None]:
# Attach constant provenance columns to every ATAC cell.
for column, label in {
    "domain": "scATAC-seq",
    "protocol": "SNARE-seq",
    "dataset": "Chen-2019-ATAC",
}.items():
    atac.obs[column] = label

In [None]:
# Split peak names of the form "<chrom>:<start>-<end>" into coordinate columns.
# A single anchored regex replaces three separate `np.vectorize` passes that
# parsed the same string in inconsistent ways: the end coordinate used to be
# taken as the second "-"-delimited token of the whole name, so a hyphen inside
# the chromosome name would have shifted it.
peak_coords = atac.var_names.str.extract(
    r"^(?P<chrom>[^:]+):(?P<chromStart>\d+)-(?P<chromEnd>\d+)$"
)

# Names that do not match come back as NaN; fail early with a readable message
# instead of an opaque error during the integer conversion.
unparsed_peaks = atac.var_names[peak_coords.isna().any(axis=1).to_numpy()]
if len(unparsed_peaks) > 0:
    raise ValueError(
        f"{len(unparsed_peaks)} peak names do not match '<chrom>:<start>-<end>', "
        f"e.g. {unparsed_peaks[0]!r}"
    )

# `.to_numpy()` drops the positional index of `peak_coords`, so values are
# assigned in order rather than being aligned against the peak names.
atac.var["chrom"] = peak_coords["chrom"].to_numpy()
atac.var["chromStart"] = peak_coords["chromStart"].astype(int).to_numpy()
atac.var["chromEnd"] = peak_coords["chromEnd"].astype(int).to_numpy()
# Constant genome-build label for every peak.
atac.var["genome"] = "mm10"

# Pair samples & add cell types

In [None]:
# Load the per-cell metadata table (first column as index) and drop every row
# whose `Ident` value is 'Mis'.
meta = (
    pd.read_csv("../download/Chen-2019/AdBrainCortex_SNAREseq_metadata.csv", index_col=0)
    .query("Ident != 'Mis'")
)
meta.head()

In [None]:
# Restrict both modalities to the cells listed in `meta`, in `meta` order, so
# the rows of `rna` and `atac` refer to the same cells.
# `.copy()` materialises the subsets: plain indexing returns AnnData views, and
# the in-place edits made afterwards would otherwise force an implicit copy
# (with an ImplicitModificationWarning) the first time a view is written to.
rna = rna[meta.index, :].copy()
atac = atac[meta.index, :].copy()

In [None]:
# Copy the `Ident` column of `meta` into both modalities as `cell_type`
# (pandas aligns the values on the cell index).
for adata in (rna, atac):
    adata.obs["cell_type"] = meta["Ident"]

# Clean data

In [None]:
# Keep only genes that have a value in all three coordinate columns.
retained_genes = rna.var.dropna(subset=["chrom", "chromStart", "chromEnd"]).index
# `.copy()` turns the subset from a view into a standalone AnnData before
# `.var` is reassigned; writing to a view would force an implicit copy and
# emit an ImplicitModificationWarning.
rna = rna[:, retained_genes].copy()
# With the missing values gone, the coordinates can be stored as integers.
rna.var = rna.var.astype({"chromStart": int, "chromEnd": int})
rna

In [None]:
# Discard genes with fewer than one total count; `rna` is modified in place
# (`inplace=True` is scanpy's default, spelled out here for the reader).
sc.pp.filter_genes(rna, inplace=True, min_counts=1)
rna

In [None]:
# Build a Bed table from the peak annotation, using the peak names as `name`.
peak_bed = scglue.genomics.Bed(atac.var.assign(name=atac.var_names))

# Graph linking peaks to the regions of the mm10 blacklist file.
# NOTE(review): `window_size=0` presumably connects only intervals that
# directly overlap - confirm against the scglue `window_graph` documentation.
blacklist_overlap = scglue.genomics.window_graph(
    peak_bed,
    "../genome/Blacklist/lists/mm10-blacklist.v2.bed.gz",
    window_size=0,
)

In [None]:
# Rows of the biadjacency matrix follow `atac.var_names`; a row sums to zero
# when the peak has no edge in `blacklist_overlap`, and only those peaks are
# retained.
overlap_per_peak = biadjacency_matrix(blacklist_overlap, atac.var_names).sum(axis=1)
retained_peaks = np.asarray(overlap_per_peak).ravel() == 0
# `.copy()` turns the subset from a view into a standalone AnnData before
# `.var` is reassigned; writing to a view would force an implicit copy and
# emit an ImplicitModificationWarning.
atac = atac[:, retained_peaks].copy()
atac.var = atac.var.astype({"chromStart": int, "chromEnd": int})
atac

In [None]:
# Discard peaks with fewer than one total count; `atac` is modified in place
# (`inplace=True` is scanpy's default, spelled out here for the reader).
sc.pp.filter_genes(atac, inplace=True, min_counts=1)
atac

# Process data

In [None]:
# Flag the 2000 top-ranked variable genes using the "seurat_v3" flavor.
sc.pp.highly_variable_genes(
    rna,
    flavor="seurat_v3",
    n_top_genes=2000,
)
# Number of genes flagged as highly variable.
rna.var["highly_variable"].sum()

# Save data

In [None]:
rna.write("../dataset/Chen-2019-RNA.h5ad", compression="gzip")
atac.write("../dataset/Chen-2019-ATAC.h5ad", compression="gzip")
!touch ../dataset/Chen-2019-FRAGS2RNA.h5ad  # Sham file