In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import scipy.io
import anndata
import scanpy as sc
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue

# scRNA-seq

## Read data

In [None]:
# Count table as downloaded: genes as rows (first column is the row label),
# cells as columns.
rna_counts_path = "../download/Ma-2020/GSM4156608_skin.late.anagen.rna.counts.txt.gz"
rna_counts = pd.read_table(rna_counts_path, index_col=0)

# Cell metadata is keyed by the column labels, with "," swapped for "." so the
# barcodes use dots as field separators.
rna_obs = pd.DataFrame(index=rna_counts.columns)
rna_obs.index = rna_obs.index.str.replace(",", ".")
rna_obs.index.name = "cells"

# Gene metadata is keyed by the row labels.
rna_var = pd.DataFrame(index=rna_counts.index)
rna_var.index.name = "genes"

# AnnData expects cells x genes, hence the transpose before sparsifying.
rna_matrix = scipy.sparse.csr_matrix(rna_counts.to_numpy().T)
rna = anndata.AnnData(X=rna_matrix, obs=rna_obs, var=rna_var)
rna

## Process meta

In [None]:
# Constant per-dataset labels, written as obs columns in this order.
rna_meta = {
    "domain": "scRNA-seq",
    "protocol": "SHARE-seq",
    "dataset": "Ma-2020-RNA",
}
for meta_key, meta_value in rna_meta.items():
    rna.obs[meta_key] = meta_value

In [None]:
# Annotate genes from the GENCODE GTF, matching on the GTF "gene_name" field.
# NOTE(review): the chrom / chromStart / chromEnd columns of rna.var used in the
# cleaning step presumably come from this call -- confirm against scglue docs.
gencode_gtf = "../genome/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz"
scglue.data.get_gene_annotation(rna, gtf=gencode_gtf, gtf_by="gene_name")
rna.var["genome"] = "mm10"

# ATAC

## Read data

In [None]:
# Three parallel files describe the ATAC matrix: the counts in Matrix Market
# format, one barcode per line, and one peak per line in BED-like columns.
atac_counts_path = "../download/Ma-2020/GSM4156597_skin.late.anagen.counts.txt.gz"
atac_barcodes_path = "../download/Ma-2020/GSM4156597_skin.late.anagen.barcodes.txt.gz"
atac_peaks_path = "../download/Ma-2020/GSM4156597_skin.late.anagen.peaks.bed.gz"

atac_counts = scipy.io.mmread(atac_counts_path)

# The single barcode column becomes the obs index.
atac_obs = pd.read_table(atac_barcodes_path, header=None, names=["Cells"], index_col=0)
atac_obs.index.name = "cells"

# Peaks keep their default positional index here; the index is rebuilt from the
# coordinates in a later cell.
atac_var = pd.read_table(atac_peaks_path, header=None, names=["chrom", "chromStart", "chromEnd"])
atac_var.index.name = "peaks"

# The matrix is transposed so that rows line up with atac_obs and columns with atac_var.
atac = anndata.AnnData(X=atac_counts.T.tocsr(), obs=atac_obs, var=atac_var)
atac

## Process meta

In [None]:
# Constant per-dataset labels, written as obs columns in this order.
atac_meta = {
    "domain": "scATAC-seq",
    "protocol": "SHARE-seq",
    "dataset": "Ma-2020-ATAC",
}
for meta_key, meta_value in atac_meta.items():
    atac.obs[meta_key] = meta_value

In [None]:
# Name every peak "chrom:chromStart-chromEnd" and use that as the var index,
# keeping whatever name the current index carries.
peak_start = atac.var["chromStart"].astype(str)
peak_end = atac.var["chromEnd"].astype(str)
peak_ids = atac.var["chrom"] + ":" + peak_start + "-" + peak_end
atac.var.index = pd.Index(peak_ids, name=atac.var.index.name)
atac.var["genome"] = "mm10"

# FRAGS2RNA

In [None]:
# NOTE(review): presumably a cells x genes matrix of ATAC fragments mapped onto
# genes (the axes are named "cells"/"genes" below) -- confirm against the
# scglue.data.bedmap2anndata documentation.
frags2rna_bedmap = "../download/Ma-2020/GSM4156597_skin.late.anagen.atac.fragments.bedmap.gz"
frags2rna = scglue.data.bedmap2anndata(frags2rna_bedmap)

# Same "," -> "." barcode normalisation that is applied to the RNA barcodes.
frags2rna.obs.index = frags2rna.obs.index.str.replace(",", ".")
frags2rna.obs.index.name = "cells"
frags2rna.var.index.name = "genes"
frags2rna

In [None]:
# Constant per-dataset labels, written as obs columns in this order.
frags2rna_meta = {
    "domain": "scATAC-seq",
    "protocol": "SHARE-seq",
    "dataset": "Ma-2020-FRAGS2RNA",
}
for meta_key, meta_value in frags2rna_meta.items():
    frags2rna.obs[meta_key] = meta_value

# Pair samples & add cell types

In [None]:
# Tab-separated table pairing RNA and ATAC barcodes with a cell type label
# (columns "rna.bc", "atac.bc" and "celltype" are used further down).
cell_type_path = "../download/Ma-2020/celltype_v2.txt"
cell_type = pd.read_table(cell_type_path)
cell_type.shape

In [None]:
# Fix misspelled or inconsistently formatted labels in the source table.
# Keys are the labels exactly as they appear in the file; values are the
# cleaned-up replacements.
celltype_renames = {
    "Dermal Fibrobalst": "Dermal Fibroblast",
    "Hair Shaft-cuticle.cortex": "Hair Shaft-Cuticle/Cortex",
    "K6+ Bulge Companion Layer": "K6+ Bulge/Companion Layer",
    "ahighCD34+ bulge": "ahigh CD34+ bulge",
    "alowCD34+ bulge": "alow CD34+ bulge"
}
cell_type["celltype"] = cell_type["celltype"].replace(celltype_renames)

# Drop cells labelled 'Mix'. The explicit .copy() makes the filtered table an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
cell_type = cell_type.query("celltype != 'Mix'").copy()
cell_type.shape

The ATAC barcodes in the cell type table do not match those in the ATAC count matrix, so their last dot-separated field has to be converted first.

In [None]:
# Lookup applied to the last dot-separated field of an ATAC barcode.
# NOTE(review): "04" and "05" both map to "53" -- confirm this is intended.
atac_bc_map = {
    "04": "53",
    "05": "53",
    "06": "54",
    "07": "55",
    "08": "56"
}


def _map_one_atac_bc(x):
    """Replace the last dot-separated field of barcode ``x`` via ``atac_bc_map``.

    Raises ``KeyError`` if that field is not a key of ``atac_bc_map``.
    """
    *head, suffix = x.split(".")
    return ".".join([*head, atac_bc_map[suffix]])


# Element-wise version for arrays / Series of barcodes; returns a NumPy string
# array. Declaring ``otypes`` up front lets empty input work (without it,
# np.vectorize raises ValueError on size-0 input) and avoids the extra probing
# call np.vectorize otherwise makes on the first element.
map_atac_bc = np.vectorize(_map_one_atac_bc, otypes=[str])

# Converted ATAC barcodes, stored next to the originals; the values are
# assigned positionally, row by row.
cell_type["atac.bc.mapped"] = map_atac_bc(cell_type["atac.bc"].to_numpy())

In [None]:
# Keep only the annotated cells, selected by their RNA barcode. The explicit
# .copy() turns the subset into a standalone AnnData, so the column assignments
# below do not write into a view (which would copy implicitly and emit
# ImplicitModificationWarning).
rna = rna[cell_type["rna.bc"].to_numpy(), :].copy()
# Positional assignment: relies on the subset following the table's row order.
rna.obs["cell_type"] = cell_type["celltype"].to_numpy()
# Batch label = last dot-separated field of the barcode.
rna.obs["batch"] = rna.obs_names.str.split(".").str[-1]

In [None]:
# Keep only the annotated cells, selected by their converted ATAC barcode. The
# explicit .copy() turns the subset into a standalone AnnData, so the column
# assignments below do not write into a view (which would copy implicitly and
# emit ImplicitModificationWarning).
atac = atac[cell_type["atac.bc.mapped"].to_numpy(), :].copy()
# Positional assignment: relies on the subset following the table's row order.
atac.obs["cell_type"] = cell_type["celltype"].to_numpy()
# Batch label = last dot-separated field of the barcode.
atac.obs["batch"] = atac.obs_names.str.split(".").str[-1]

In [None]:
# frags2rna is keyed by the unconverted ATAC barcodes. The explicit .copy()
# turns the subset into a standalone AnnData, so the assignments below do not
# write into a view (which would copy implicitly and emit
# ImplicitModificationWarning).
frags2rna = frags2rna[cell_type["atac.bc"].to_numpy(), :].copy()
# Positional assignment: relies on the subset following the table's row order.
frags2rna.obs["cell_type"] = cell_type["celltype"].to_numpy()
# Reuse the ATAC cell names. This assumes `atac` has already been subset to the
# same cell_type rows in the same order, so the two objects align row by row.
frags2rna.obs.index = atac.obs.index
# Batch label = last dot-separated field of the (now ATAC-style) barcode.
frags2rna.obs["batch"] = frags2rna.obs_names.str.split(".").str[-1]

# Clean data

In [None]:
# Keep only genes that have complete genomic coordinates.
retained_genes = rna.var.dropna(subset=["chrom", "chromStart", "chromEnd"]).index
# Explicit .copy(): replacing .var on a view would otherwise trigger an implicit
# copy together with ImplicitModificationWarning.
rna = rna[:, retained_genes].copy()
# With the missing values gone, the coordinates can be stored as integers.
rna.var = rna.var.astype({"chromStart": int, "chromEnd": int})
rna

In [None]:
# Remove genes with a total count below 1 across the retained cells;
# `rna` is modified in place.
sc.pp.filter_genes(rna, min_counts=1, inplace=True)
rna

In [None]:
# Peaks as a Bed table, with the peak name taken from the var index.
atac_bed = scglue.genomics.Bed(atac.var.assign(name=atac.var_names))
# NOTE(review): with window_size=0 this presumably links each peak to the
# blacklist regions it directly overlaps -- confirm against scglue docs.
blacklist_overlap = scglue.genomics.window_graph(
    atac_bed,
    "../genome/Blacklist/lists/mm10-blacklist.v2.bed.gz",
    window_size=0,
)

In [None]:
# One row per peak (in atac.var_names order); the row sum counts that peak's
# edges in the blacklist overlap graph.
peak_blacklist_hits = biadjacency_matrix(blacklist_overlap, atac.var_names).sum(axis=1)
# Keep peaks with no blacklist edge at all. np.asarray + ravel flattens the
# row sums into a 1-D boolean mask.
retained_peaks = np.asarray(peak_blacklist_hits).ravel() == 0
# Explicit .copy(): replacing .var on a view would otherwise trigger an implicit
# copy together with ImplicitModificationWarning.
atac = atac[:, retained_peaks].copy()
atac.var = atac.var.astype({"chromStart": int, "chromEnd": int})
atac

In [None]:
# The variables of `atac` are peaks, so this removes peaks with a total count
# below 1 across the retained cells; `atac` is modified in place.
sc.pp.filter_genes(atac, min_counts=1, inplace=True)
atac

In [None]:
# Genes present in RNA but absent from frags2rna. Their order does not matter
# because the columns are re-ordered to match RNA below.
missing_vars = sorted(set(rna.var_names) - set(frags2rna.var_names))

# All-zero block for the missing genes, with the same cells as frags2rna.
frags2rna_padding = anndata.AnnData(
    X=scipy.sparse.csr_matrix((frags2rna.shape[0], len(missing_vars))),
    obs=pd.DataFrame(index=frags2rna.obs_names),
    var=pd.DataFrame(index=missing_vars),
)

# Append the padding along the feature axis.
frags2rna = anndata.concat([frags2rna, frags2rna_padding], axis=1, merge="first")
frags2rna = frags2rna[:, rna.var_names].copy()  # Keep the same features as RNA
frags2rna

# Process data

In [None]:
# Flag the top 2000 highly variable genes; the flag is then read back from
# rna.var["highly_variable"] to show how many genes were selected.
sc.pp.highly_variable_genes(rna, flavor="seurat_v3", n_top_genes=2000)
rna.var["highly_variable"].sum()

# Save data

In [None]:
# Write each processed object to its own gzip-compressed .h5ad file.
for adata, h5ad_path in (
    (rna, "../dataset/Ma-2020-RNA.h5ad"),
    (atac, "../dataset/Ma-2020-ATAC.h5ad"),
    (frags2rna, "../dataset/Ma-2020-FRAGS2RNA.h5ad"),
):
    adata.write(h5ad_path, compression="gzip")