In [None]:
import gzip
import pickle

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse
import sklearn.preprocessing
from networkx.algorithms.bipartite import biadjacency_matrix
from ALLCools.mcds import MCDS

import scglue

In [None]:
# Root data directory of the downloaded bag; dataset files below are read from
# paths of the form {bag}/{subbag}/...
bag = "../download/Yao-2021/MOp_MiniAtlas_2020_bdbag_2021_04_28/data"

# scRNA-seq

## Read data

In [None]:
# Sub-directory (relative to `bag`) that holds umi_counts.h5 and the metadata CSVs
# read in this section.
subbag = "Analysis_10X_cells_v3_AIBS/data/10X_cells_v3_AIBS"

In [None]:
# Load the 10x HDF5 count matrix, de-duplicate gene names and label both axes.
rna = sc.read_10x_h5(f"{bag}/{subbag}/umi_counts.h5")
rna.var_names_make_unique()
rna.obs.index.name = "cells"
rna.var.index.name = "genes"
rna

In [None]:
def _read_indexed_csv(path):
    """Read a CSV file, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# Per-cell sample metadata, per-cell cluster membership and per-cluster annotation.
sample_metadata = _read_indexed_csv(f"{bag}/{subbag}/sample_metadata.csv")
cluster_membership = _read_indexed_csv(f"{bag}/{subbag}/cluster.membership.csv")
cluster_annotation = _read_indexed_csv(f"{bag}/{subbag}/cluster.annotation.csv")

In [None]:
def _before_l8tx(index):
    """Return `index` with every label truncated at its first "L8TX" occurrence."""
    return index.str.split("L8TX").map(lambda parts: parts[0])


# Both tables are re-keyed with the truncated labels; the assertions in the next
# cell check that these labels exist in `rna.obs_names`.
sample_metadata.index = _before_l8tx(sample_metadata.index)
cluster_membership.index = _before_l8tx(cluster_membership.index)

# Attach every cluster-level annotation column to each cell by looking up the
# cell's cluster id (column "x") in the annotation table.
for annotation, per_cluster in cluster_annotation.items():
    cluster_membership[annotation] = cluster_membership["x"].map(per_cluster.to_dict())

In [None]:
# Every clustered cell must be present in both the count matrix and the sample metadata.
clustered_cells = set(cluster_membership.index)
assert clustered_cells <= set(rna.obs_names)
assert clustered_cells <= set(sample_metadata.index)

In [None]:
# Restrict to clustered cells (in cluster_membership order), then attach the
# cluster and sample annotations as obs columns.
rna = rna[cluster_membership.index]
cell_annotations = rna.obs.join(cluster_membership).join(sample_metadata)
rna.obs = cell_annotations
rna

## Process meta

In [None]:
# Constant provenance columns shared by every cell of this modality.
rna_constants = {
    "domain": "scRNA-seq",
    "protocol": "10x RNA",
    "dataset": "Yao-2021-RNA",
}
for column, value in rna_constants.items():
    rna.obs[column] = value

In [None]:
# Subclass labels that are renamed or merged to form `cell_type`; every label not
# listed here is carried over unchanged.
subclass_to_cell_type = {
    "L5 IT": "L4/5 IT",
    "L5/6 NP": "NP",
    "L6 IT Car3": "L6 IT",
    "L6b": "L6 CT",
}
rna.obs["cell_type"] = rna.obs["subclass_label"].replace(subclass_to_cell_type)
rna.obs["Donor"] = rna.obs["Donor"].astype("category")

In [None]:
# Annotate genes from the GENCODE vM25 GTF: the `gene_ids` column of `rna.var` is
# matched against the GTF `gene_id` field, with `ens_trim_version` applied as the
# key function (presumably it strips Ensembl version suffixes — confirm in scglue).
# Later cells read `chrom`, `chromStart` and `chromEnd` from `rna.var`; presumably
# this call is what adds them.
scglue.data.get_gene_annotation(
    rna, var_by="gene_ids",
    gtf="../genome/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz", gtf_by="gene_id",
    by_func=scglue.genomics.ens_trim_version
)
# Record the genome assembly name on every gene.
rna.var["genome"] = "mm10"

# scATAC-seq

## Read data

In [None]:
# Sub-directory (relative to `bag`) used by the scATAC-seq and FRAGS2RNA cells.
subbag = "Analysis_EckerRen_Mouse_MOp_methylation_ATAC/data/EckerRen_Mouse_MOp_methylation_ATAC"
# Sample identifiers; each one locates `{sample}.pkl.gz` and
# `{sample}.fragments.bedmap.gz` under dataset/ATAC below.
samples = [
    "CEMBA171206_3C", "CEMBA171207_3C", "CEMBA171212_4B",
    "CEMBA171213_4B", "CEMBA180104_4B", "CEMBA180409_2C",
    "CEMBA180410_2C", "CEMBA180612_5D", "CEMBA180618_5D"
]

In [None]:
# Build one AnnData per sample from the gzipped pickles, which hold "X",
# "obs_names" and "var_names" entries.
# NOTE(review): pickle.load can execute arbitrary code contained in the file; only
# run this cell on a trusted copy of the downloaded bag.
peak_matrices = []
for sample in samples:
    with gzip.open(f"{bag}/{subbag}/dataset/ATAC/{sample}.pkl.gz") as handle:
        payload = pickle.load(handle)
    cell_table = pd.DataFrame({"sample": sample}, index=payload["obs_names"])
    peak_table = pd.DataFrame(index=payload["var_names"])
    peak_matrices.append(
        anndata.AnnData(X=payload["X"], obs=cell_table, var=peak_table)
    )

In [None]:
# NOTE: this cell is not idempotent — running it twice prefixes obs_names twice.
bed_columns = ["chrom", "chromStart", "chromEnd"]
for i, peak_matrix in enumerate(peak_matrices):
    # Make cell names unique across samples: "<sample>_<original name>".
    peak_matrix.obs_names = peak_matrix.obs["sample"] + "_" + peak_matrix.obs_names
    # Peak names are split on ":" and "-" into the three BED coordinate fields.
    fields = peak_matrix.var_names.str.split("[:-]")
    for position, column in enumerate(bed_columns):
        peak_matrix.var[column] = fields.map(
            lambda parts, position=position: parts[position]
        )
    peak_matrix.var["name"] = peak_matrix.var_names
    # Only the first three columns are written; the files are pooled by the next cell.
    sample_bed = scglue.genomics.Bed(peak_matrix.var)
    sample_bed.write_bed(
        f"{bag}/{subbag}/dataset/ATAC/peaks_{i}.bed",
        ncols=3
    )

In [None]:
!cat {bag}/{subbag}/dataset/ATAC/peaks_*.bed | sort -k1,1 -k2,2n > {bag}/{subbag}/dataset/ATAC/peaks_all.bed
!bedtools merge -i {bag}/{subbag}/dataset/ATAC/peaks_all.bed | sort -k1,1 -k2,2n > {bag}/{subbag}/dataset/ATAC/peaks_merged.bed

In [None]:
# Load the merged peak set and name each peak "chrom:start-end".
merged_peaks = scglue.genomics.Bed.read_bed(f"{bag}/{subbag}/dataset/ATAC/peaks_merged.bed")
merged_peaks["name"] = (
    merged_peaks["chrom"] + ":"
    + merged_peaks["chromStart"].astype(str) + "-"
    + merged_peaks["chromEnd"].astype(str)
)
merged_peaks.index = merged_peaks["name"]


def _onto_merged_peaks(sample_matrix):
    """Re-express one per-sample matrix on the merged peak set.

    A graph linking the sample's peaks to merged peaks is built with
    ``window_graph`` (``window_size=0``); its biadjacency matrix
    (sample peaks x merged peaks) is right-multiplied onto the counts.
    The returned AnnData keeps the sample's obs and uses the first three
    columns of ``merged_peaks`` as var.
    """
    mapping_graph = scglue.genomics.window_graph(
        scglue.genomics.Bed(sample_matrix.var), merged_peaks,
        window_size=0, right_sorted=True
    )
    mapping_matrix = biadjacency_matrix(
        mapping_graph, sample_matrix.var_names, merged_peaks.index
    )
    return anndata.AnnData(
        sample_matrix.X @ mapping_matrix,
        obs=sample_matrix.obs, var=merged_peaks.df.iloc[:, :3]
    )


peak_matrices = [_onto_merged_peaks(sample_matrix) for sample_matrix in peak_matrices]

In [None]:
# Per-cell analysis table, re-keyed as "<sample>_<barcode>" to match the cell names
# assigned to the peak matrices.
atac_result_csv = f"{bag}/{subbag}/study/ATAC/MOp.snATAC-seq.AnalysisResult.csv.gz"
analysis_result = pd.read_csv(atac_result_csv)
cell_keys = analysis_result["sample"] + "_" + analysis_result["barcode"]
analysis_result.index = cell_keys

In [None]:
# Stack all samples and keep the cells listed in the analysis table, in its order.
atac = anndata.concat(peak_matrices, axis=0, merge="same")[analysis_result.index]
del atac.obs["sample"]  # analysis_result carries its own "sample" column
atac.obs = atac.obs.join(analysis_result)
atac.obs.index.name = "cells"
atac.var.index.name = "peaks"
atac

## Process meta

In [None]:
# Constant provenance columns shared by every cell of this modality.
atac_constants = {
    "domain": "scATAC-seq",
    "protocol": "snATAC-seq",
    "dataset": "Yao-2021-ATAC",
}
for column, value in atac_constants.items():
    atac.obs[column] = value

In [None]:
# Rename or merge ATAC MajorCluster labels into `cell_type`; labels not listed in
# the mapping are carried over unchanged.
atac.obs["cell_type"] = atac.obs["MajorCluster"].replace({
    "L4": "L4/5 IT", "L5.IT.a": "L4/5 IT", "L5.IT.b": "L4/5 IT",
    "L5.PT": "L5 ET", "L6.CT": "L6 CT", "L6.IT": "L6 IT",
    "L23.a": "L2/3 IT", "L23.b": "L2/3 IT", "L23.c": "L2/3 IT",
    "Pv": "Pvalb"
})
# Cells labelled "CGE" get the text before the first "_" of their SubCluster instead.
# The write goes through one `.loc[rows, column]` call: the chained form
# `obs["cell_type"].loc[mask] = ...` assigns into an intermediate Series, which is
# not guaranteed to update `atac.obs` and is a silent no-op under pandas copy-on-write.
mask = atac.obs["cell_type"] == "CGE"
atac.obs.loc[mask, "cell_type"] = (
    atac.obs.loc[mask, "SubCluster"].str.split("_").map(lambda parts: parts[0])
)
# Each sample is treated as one batch.
atac.obs["batch"] = atac.obs["sample"]

In [None]:
# Record the genome assembly name on every peak.
atac.var["genome"] = "mm10"

# FRAGS2RNA

In [None]:
# One AnnData per ATAC sample, built from its fragment bedmap file.
frags2rnas = []
for sample in samples:
    bedmap_path = f"{bag}/{subbag}/dataset/ATAC/{sample}.fragments.bedmap.gz"
    frags2rnas.append(scglue.data.bedmap2anndata(bedmap_path))

In [None]:
# Prefix cell names with the sample id, matching the naming used for `atac`.
# NOTE: not idempotent — running this cell twice prefixes the names twice.
for sample, sample_frags in zip(samples, frags2rnas):
    sample_frags.obs_names = f"{sample}_" + sample_frags.obs_names

In [None]:
# Stack all samples, then rebuild the object with the cells of `atac` (same order)
# and a copy of its obs annotations.
pooled_frags = anndata.concat(frags2rnas, axis=0, merge="same")
frags2rna = anndata.AnnData(
    pooled_frags[atac.obs_names].X,
    obs=atac.obs, var=pooled_frags.var
)
del pooled_frags  # do not keep the unfiltered matrix alive
frags2rna.obs.index.name = "cells"
frags2rna.var.index.name = "genes"
frags2rna

# snmC-seq

## Read data

In [None]:
# Same value as assigned in the scATAC-seq section.
subbag = "Analysis_EckerRen_Mouse_MOp_methylation_ATAC/data/EckerRen_Mouse_MOp_methylation_ATAC"
# Sample identifiers; each one locates `{sample}.mcds` under dataset/mC below.
# NOTE: this re-binds `samples`, which the scATAC-seq / FRAGS2RNA cells also read;
# re-running those cells after this one would pick up this list instead.
samples = [
    "2C-180409", "2C-180410", "3C-171206",
    "3C-171207", "4B-171212", "4B-171213",
    "4B-180104", "5D-180605", "5D-180612"
]

In [None]:
# Lookup from the `gene_ids` column of `rna.var` to the gene name used as the RNA var index.
gene_id_mapping = dict(zip(rna.var["gene_ids"], rna.var_names))

In [None]:
# Build one AnnData per sample whose features are "<gene>_mCH" followed by
# "<gene>_mCG": X holds methylation rates, layer "norm" the rates divided by the
# cell's pooled rate.
met_matrices = []
for sample in samples:
    gene_da = MCDS.open(f"{bag}/{subbag}/dataset/mC/{sample}.mcds")["gene_da"]
    trimmed_gene_ids = np.vectorize(scglue.genomics.ens_trim_version)(gene_da.gene)
    gene_da = gene_da.assign_coords(gene=trimmed_gene_ids)

    # Only genes whose trimmed id has an entry in gene_id_mapping are kept.
    mapped = gene_da.gene.to_index().map(gene_id_mapping)
    is_mapped = ~mapped.isna()

    rate_blocks, norm_blocks = [], []
    for mc_type in ("CHN", "CGN"):  # same order as the "_mCH" / "_mCG" suffixes below
        methylated = gene_da.sel(count_type="mc", mc_type=mc_type)
        covered = gene_da.sel(count_type="cov", mc_type=mc_type)
        # Missing ratios (e.g. 0 / 0 where a gene has no coverage) are set to 0.
        rate = (methylated / covered).to_pandas().fillna(0)
        # Pooled rate over all genes, computed before the gene filter is applied.
        pooled_rate = (methylated.sum(dim="gene") / covered.sum(dim="gene")).to_pandas()
        norm = rate.divide(pooled_rate, axis=0)
        rate_blocks.append(rate.loc[:, is_mapped].to_numpy())
        norm_blocks.append(norm.loc[:, is_mapped].to_numpy())

    X = np.concatenate(rate_blocks, axis=1)
    X_norm = np.concatenate(norm_blocks, axis=1)
    obs = pd.DataFrame({"sample": sample}, index=gene_da.cell)
    gene_var = rna.var.loc[mapped.dropna(), ["gene_ids"]]
    var = pd.concat([
        gene_var.set_index(gene_var.index + suffix) for suffix in ("_mCH", "_mCG")
    ])
    met_matrices.append(anndata.AnnData(X=X, obs=obs, var=var, layers={"norm": X_norm}))

In [None]:
# Per-cell analysis table for the methylation data, keyed by its "index" column.
met_result_csv = f"{bag}/{subbag}/study/mC/MOp_clustering/MOp.snmC-seq.AnalysisResult.csv.gz"
analysis_result = pd.read_csv(met_result_csv)
analysis_result = analysis_result.set_index("index")

In [None]:
# Stack all samples and keep the cells listed in the analysis table, in its order.
met = anndata.concat(met_matrices, axis=0, merge="same")[analysis_result.index]
met.obs = met.obs.join(analysis_result)
met.obs.index.name = "cells"
met.var.index.name = "genes"
met

## Process meta

In [None]:
# Constant provenance columns shared by every cell of this modality.
met_constants = {
    "domain": "snmC-seq",
    "protocol": "snmC-seq2",
    "dataset": "Yao-2021-MET",
}
for column, value in met_constants.items():
    met.obs[column] = value

In [None]:
# Rename or merge methylation MajorCluster labels into `cell_type`; labels not
# listed in the mapping are carried over unchanged.
met.obs["cell_type"] = met.obs["MajorCluster"].replace({
    "L23-IT-Cux2": "L2/3 IT", "L4-IT-Rorb": "L4/5 IT", "L5-IT-Deptor": "L4/5 IT",
    "L5-PT-Bcl6": "L5 ET",  "L6-CT-Foxp2": "L6 CT", "L6b-Galnt10": "L6 CT",
    "L6-IT-Sulf1": "L6 IT", "L6-NP-Tshz2": "NP", "MGE-Sst": "Sst", "MGE-Pvalb": "Pvalb"
})
# Cells labelled "CGE-VipNdnf" get the third token of their SubCluster (split on
# "-" or "_") instead.
# The write goes through one `.loc[rows, column]` call: the chained form
# `obs["cell_type"].loc[mask] = ...` assigns into an intermediate Series, which is
# not guaranteed to update `met.obs` and is a silent no-op under pandas copy-on-write.
mask = met.obs["cell_type"] == "CGE-VipNdnf"
met.obs.loc[mask, "cell_type"] = (
    met.obs.loc[mask, "SubCluster"].str.split(r"[-_]").map(lambda parts: parts[2])
)

In [None]:
# Record the genome assembly name on every methylation feature.
met.var["genome"] = "mm10"

# Clean data

In [None]:
exclude_cell_types = [
    "Low Quality", "doublet",
    "Astro", "Endo", "Macrophage", "OPC", "Oligo", "SMC", "VLMC"
]  # Only keep neurons
# Keep every cell whose cell_type is not in the exclusion list.
is_excluded = rna.obs["cell_type"].isin(exclude_cell_types)
retained_cells = rna.obs.index[~is_excluded.to_numpy()]
rna = rna[retained_cells, :]
rna

In [None]:
# Keep genes that have all three genomic coordinates, then store start/end as integers.
coordinate_columns = ["chrom", "chromStart", "chromEnd"]
has_coordinates = rna.var[coordinate_columns].notna().all(axis=1)
retained_genes = rna.var.index[has_coordinates.to_numpy()]
rna = rna[:, retained_genes]
rna.var = rna.var.astype({"chromStart": int, "chromEnd": int})
rna

In [None]:
# Filter genes with scanpy using a minimum total count of 1; `rna` is modified in
# place (presumably this drops genes with no counts in the retained cells — see
# the scanpy documentation for `filter_genes`).
sc.pp.filter_genes(rna, min_counts=1)
rna

In [None]:
exclude_cell_types = [
    "ASC", "Endo", "MGC", "OGC", "OPC", "Other", "Smc"
]  # Only keep neurons
# Keep every cell whose cell_type is not in the exclusion list.
is_excluded = atac.obs["cell_type"].isin(exclude_cell_types)
retained_cells = atac.obs.index[~is_excluded.to_numpy()]
atac = atac[retained_cells, :]
atac

In [None]:
# Graph linking each peak to the blacklist regions it overlaps (window_size=0).
# The peaks are mm10 coordinates (`atac.var["genome"]` is set to "mm10"), so they
# must be screened against the mm10 blacklist; the hg38 list describes human
# coordinates and would both miss mouse artefact regions and drop unrelated peaks.
blacklist_overlap = scglue.genomics.window_graph(
    scglue.genomics.Bed(atac.var.assign(name=atac.var_names)),
    "../genome/Blacklist/lists/mm10-blacklist.v2.bed.gz",
    window_size=0
)

In [None]:
# A peak is retained when its row of the peak-by-blacklist biadjacency matrix sums to zero.
blacklist_hits = biadjacency_matrix(blacklist_overlap, atac.var_names).sum(axis=1)
retained_peaks = np.asarray(blacklist_hits).ravel() == 0
atac = atac[:, retained_peaks]
atac.var = atac.var.astype({"chromStart": int, "chromEnd": int})
atac

In [None]:
# Same scanpy filter as used for RNA, applied to peaks (the var axis of `atac`)
# with a minimum total count of 1; `atac` is modified in place.
sc.pp.filter_genes(atac, min_counts=1)
atac

In [None]:
# Keep peaks that have counts in more than half of the batches.
# NOTE: the variables are named "donor" but the grouping column is "batch".
ohe = sklearn.preprocessing.OneHotEncoder()
donor_ohe = ohe.fit_transform(atac.obs[["batch"]])  # cells x batches indicator
atac_per_donor = donor_ohe.T @ atac.X  # batches x peaks: counts summed per batch
# np.asarray(...).ravel() instead of `.A1`: `.A1` exists only on np.matrix and
# fails when the sum comes back as a plain ndarray (sparse arrays or a dense X).
# This is the same idiom the blacklist cell uses.
atac_n_donor = np.asarray((atac_per_donor > 0).sum(axis=0)).ravel()
atac = atac[:, atac_n_donor > atac_per_donor.shape[0] / 2].copy()
atac.shape

In [None]:
# Keep the cells that were retained in `atac`. The subset is taken from
# `atac.obs_names` rather than from `retained_cells`, because that variable is
# re-bound for every modality in this section and may hold another modality's
# cells if cells are run out of order.
frags2rna = frags2rna[atac.obs_names, :]
frags2rna

In [None]:
# Genes present in RNA but absent from frags2rna are appended as all-zero columns,
# so that frags2rna can be indexed with exactly the RNA gene list.
missing_vars = list(set(rna.var_names) - set(frags2rna.var_names))
zero_block = anndata.AnnData(
    X=scipy.sparse.csr_matrix((frags2rna.shape[0], len(missing_vars))),
    obs=pd.DataFrame(index=frags2rna.obs_names),
    var=pd.DataFrame(index=missing_vars),
)
frags2rna = anndata.concat([frags2rna, zero_block], axis=1, merge="first")
frags2rna = frags2rna[:, rna.var_names].copy()  # Keep the same features as RNA
frags2rna

In [None]:
exclude_cell_types = [
    "Outlier", "NonN"
]  # Only keep neurons
# Keep every cell whose cell_type is not in the exclusion list.
is_excluded = met.obs["cell_type"].isin(exclude_cell_types)
retained_cells = met.obs.index[~is_excluded.to_numpy()]
met = met[retained_cells, :]
met

# Process data

In [None]:
# Flag the top 2000 highly variable genes with the "seurat_v3" flavor; the result is
# read back from `rna.var.highly_variable` and its count displayed as a sanity check.
# NOTE(review): seurat_v3 is documented to expect raw counts — no normalisation is
# applied to `rna.X` in this notebook, but confirm umi_counts.h5 holds raw UMIs.
sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
rna.var.highly_variable.sum()

# Save data

In [None]:
# Write each processed modality to its own gzip-compressed .h5ad file.
outputs = {
    "../dataset/Yao-2021-RNA.h5ad": rna,
    "../dataset/Yao-2021-ATAC.h5ad": atac,
    "../dataset/Yao-2021-FRAGS2RNA.h5ad": frags2rna,
    "../dataset/Yao-2021-MET.h5ad": met,
}
for output_path, adata in outputs.items():
    adata.write(output_path, compression="gzip")