In [None]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse
import sklearn.preprocessing
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue

# scRNA-seq

## Read data

In [None]:
# Load the RNA AnnData object from the Muto-2021 download directory.
# The bare name on the last line displays its summary as the cell output.
rna = anndata.read_h5ad("../download/Muto-2021/rna.h5ad")
rna

In [None]:
# Read one 10x HDF5 feature-barcode matrix per control sample.
# The resulting list follows the order of the file names below.
count_matrices = []
for file in (
    "GSM4572192_Control1_filtered_feature_bc_matrix.h5",
    "GSM4572193_Control2_filtered_feature_bc_matrix.h5",
    "GSM4572194_Control3_filtered_feature_bc_matrix.h5",
    "GSM4572195_Control4_filtered_feature_bc_matrix.h5",
    "GSM4572196_Control5_filtered_feature_bc_matrix.h5",
):
    count_matrices.append(sc.read_10x_h5(f"../download/Muto-2021/{file}"))

In [None]:
# Swap whatever follows the first "-" in each barcode for the 1-based position
# of the sample in `count_matrices` (presumably so barcodes stay distinct once
# the samples are concatenated), and make feature names unique per sample.
for i, sample_counts in enumerate(count_matrices):
    barcode_stems = sample_counts.obs_names.str.split("-").str[0]
    sample_counts.obs_names = barcode_stems + f"-{i+1}"
    sample_counts.var_names_make_unique()

In [None]:
# Stack all samples along the cell axis, keeping only var columns that agree
# across samples (merge="same").
count_matrix = anndata.concat(count_matrices, axis=0, merge="same")

# Rebuild `rna`: the matrix comes from the concatenated counts, selected by the
# cell names already present in `rna`; obs / obsm / uns are carried over from
# the loaded object and var comes from the concatenated counts.
rna = anndata.AnnData(
    X=count_matrix[rna.obs_names].X,
    obs=rna.obs,
    var=count_matrix.var,
    obsm=rna.obsm,
    uns=rna.uns,
)
rna.obs.index.name = "cells"
rna.var.index.name = "genes"
rna

## Process meta

In [None]:
# Constant per-cell metadata columns describing the modality, assay protocol
# and dataset label; every cell receives the same value.
for column, value in {
    "domain": "scRNA-seq",
    "protocol": "10x RNA",
    "dataset": "Muto-2021-RNA",
}.items():
    rna.obs[column] = value

In [None]:
# Author labels that are merged into a coarser label; any label not listed
# here is kept unchanged by `replace`.
rna_cell_type_map = {
    "DCT1": "DCT",
    "DCT2": "DCT",
    "MES": "MES_FIB",
    "FIB": "MES_FIB",
}
rna.obs["cell_type"] = rna.obs["author_cell_type"].replace(rna_cell_type_map)

# The donor identifier doubles as the batch label.
rna.obs["batch"] = rna.obs["donor_uuid"]

In [None]:
# Annotate genes in `rna` from the GENCODE v35 GTF: the `gene_ids` column of
# `rna.var` is paired with the GTF `gene_id` attribute.
# NOTE(review): `ens_trim_version` presumably strips the Ensembl version suffix
# before matching, and the call presumably adds the chrom / chromStart /
# chromEnd columns that the cleaning step later reads — confirm in scglue docs.
scglue.data.get_gene_annotation(
    rna, var_by="gene_ids",
    gtf="../genome/gencode.v35.chr_patch_hapl_scaff.annotation.gtf.gz", gtf_by="gene_id",
    by_func=scglue.genomics.ens_trim_version
)
# Record the reference genome build for every gene.
rna.var["genome"] = "hg38"

# scATAC-seq

## Read data

In [None]:
# Load the ATAC AnnData object from the Muto-2021 download directory.
# The bare name on the last line displays its summary as the cell output.
atac = anndata.read_h5ad("../download/Muto-2021/atac.h5ad")
atac

In [None]:
# Read one 10x HDF5 peak-barcode matrix per control sample. `gex_only=False`
# is passed so the reader does not restrict itself to gene-expression features.
# The resulting list follows the order of the file names below.
peak_matrices = []
for file in (
    "GSM4572187_Control1_filtered_peak_bc_matrix.h5",
    "GSM4572188_Control2_filtered_peak_bc_matrix.h5",
    "GSM4572189_Control3_filtered_peak_bc_matrix.h5",
    "GSM4572190_Control4_filtered_peak_bc_matrix.h5",
    "GSM4572191_Control5_filtered_peak_bc_matrix.h5",
):
    peak_matrices.append(
        sc.read_10x_h5(f"../download/Muto-2021/{file}", gex_only=False)
    )

In [None]:
# For every sample: re-suffix barcodes with the 1-based sample position, split
# each peak name on ":" / "-" into BED coordinate columns, and write the
# sample's peaks to its own 3-column BED file (file index is 0-based).
for i, sample_peaks in enumerate(peak_matrices):
    sample_peaks.obs_names = sample_peaks.obs_names.str.split("-").str[0] + f"-{i+1}"

    name_fields = sample_peaks.var_names.str.split("[:-]")
    for position, column in enumerate(("chrom", "chromStart", "chromEnd")):
        # Bind `position` as a default so each lambda picks its own field.
        sample_peaks.var[column] = name_fields.map(
            lambda fields, position=position: fields[position]
        )
    sample_peaks.var["name"] = sample_peaks.var_names

    sample_bed = scglue.genomics.Bed(sample_peaks.var)
    sample_bed.write_bed(
        f"../download/Muto-2021/peaks_{i}.bed",
        ncols=3
    )

In [None]:
%%bash
# Abort on the first failing command, including failures inside a pipeline;
# otherwise a failing `cat` or `bedtools` is masked by the exit status of the
# trailing `sort` and an empty/partial BED file is silently produced.
set -euo pipefail

# Pool only the per-sample files (peaks_0.bed, peaks_1.bed, ...). A bare
# `peaks_*.bed` glob would also match peaks_all.bed and peaks_merged.bed left
# over from a previous run (or being created by this very pipeline).
cat ../download/Muto-2021/peaks_[0-9]*.bed | sort -k1,1 -k2,2n > ../download/Muto-2021/peaks_all.bed

# Merge the sorted, pooled intervals and write them sorted by chrom / start.
bedtools merge -i ../download/Muto-2021/peaks_all.bed | sort -k1,1 -k2,2n > ../download/Muto-2021/peaks_merged.bed

In [None]:
# Load the merged peak set, name every merged peak "chrom:start-end" and use
# that name as the index.
merged_peaks = scglue.genomics.Bed.read_bed("../download/Muto-2021/peaks_merged.bed")
merged_peaks["name"] = (
    merged_peaks["chrom"]
    + ":" + merged_peaks["chromStart"].astype(str)
    + "-" + merged_peaks["chromEnd"].astype(str)
)
merged_peaks.index = merged_peaks["name"]

# Re-express each sample on the merged peak set, replacing the list entries
# in place.
for i in range(len(peak_matrices)):
    sample_peaks = peak_matrices[i]
    # Graph linking this sample's peaks to merged peaks.
    # NOTE(review): window_size=0 presumably restricts links to overlapping
    # intervals — confirm against scglue.genomics.window_graph.
    mapping_graph = scglue.genomics.window_graph(
        scglue.genomics.Bed(sample_peaks.var), merged_peaks,
        window_size=0, right_sorted=True
    )
    # Rows follow the sample's peaks, columns follow the merged peaks.
    mapping_matrix = biadjacency_matrix(
        mapping_graph, sample_peaks.var_names, merged_peaks.index
    )
    # Multiplying by the mapping matrix yields one column per merged peak.
    peak_matrices[i] = anndata.AnnData(
        sample_peaks.X @ mapping_matrix,
        obs=sample_peaks.obs, var=merged_peaks.df.iloc[:, :3]
    )

In [None]:
# Stack all samples along the cell axis, keeping only var columns that agree
# across samples (merge="same").
peak_matrix = anndata.concat(peak_matrices, axis=0, merge="same")

# Rebuild `atac`: the matrix comes from the concatenated peak counts, selected
# by the cell names already present in `atac`; obs / obsm / uns are carried
# over from the loaded object and var comes from the concatenated counts.
atac = anndata.AnnData(
    X=peak_matrix[atac.obs_names].X,
    obs=atac.obs,
    var=peak_matrix.var,
    obsm=atac.obsm,
    uns=atac.uns,
)
atac.obs.index.name = "cells"
atac.var.index.name = "peaks"
atac

## Process meta

In [None]:
# Constant per-cell metadata columns describing the modality, assay protocol
# and dataset label; every cell receives the same value.
for column, value in {
    "domain": "scATAC-seq",
    "protocol": "10x ATAC",
    "dataset": "Muto-2021-ATAC",
}.items():
    atac.obs[column] = value

In [None]:
# Author labels PCT and PST are both relabelled as PT; any label not listed
# here is kept unchanged by `replace`.
atac_cell_type_map = {"PCT": "PT", "PST": "PT"}
atac.obs["cell_type"] = atac.obs["author_cell_type"].replace(atac_cell_type_map)

# The donor identifier doubles as the batch label.
atac.obs["batch"] = atac.obs["donor_uuid"]

In [None]:
# Record the reference genome build for every peak.
atac.var["genome"] = "hg38"

# FRAGS2RNA

In [None]:
# Convert one fragments bedmap file per control sample into an AnnData object.
# The resulting list follows the order of the file names below.
frags2rnas = []
for file in (
    "GSM4572187_Control1_fragments.bedmap.gz",
    "GSM4572188_Control2_fragments.bedmap.gz",
    "GSM4572189_Control3_fragments.bedmap.gz",
    "GSM4572190_Control4_fragments.bedmap.gz",
    "GSM4572191_Control5_fragments.bedmap.gz",
):
    frags2rnas.append(scglue.data.bedmap2anndata(f"../download/Muto-2021/{file}"))

In [None]:
# Swap whatever follows the first "-" in each barcode for the 1-based position
# of the sample in `frags2rnas`, the same scheme used for the peak matrices.
for i, sample_frags in enumerate(frags2rnas):
    barcode_stems = sample_frags.obs_names.str.split("-").str[0]
    sample_frags.obs_names = barcode_stems + f"-{i+1}"

In [None]:
# Stack all samples along the cell axis, keeping only var columns that agree
# across samples (merge="same").
frags2rna = anndata.concat(frags2rnas, axis=0, merge="same")

# Rebuild `frags2rna` on the ATAC cells: the matrix is selected by
# `atac.obs_names`, and obs / obsm / uns are taken from `atac`.
frags2rna = anndata.AnnData(
    X=frags2rna[atac.obs_names].X,
    obs=atac.obs,
    var=frags2rna.var,
    obsm=atac.obsm,
    uns=atac.uns,
)
frags2rna.obs.index.name = "cells"
frags2rna.var.index.name = "genes"
frags2rna

# Clean data

In [None]:
# Keep only genes whose three coordinate columns are all present, then store
# the start / end coordinates as integers.
coordinate_columns = ["chrom", "chromStart", "chromEnd"]
has_coordinates = rna.var[coordinate_columns].notna().all(axis=1)
retained_genes = rna.var.loc[has_coordinates].index
rna = rna[:, retained_genes]
rna.var = rna.var.astype({column: int for column in ("chromStart", "chromEnd")})
rna

In [None]:
# Apply scanpy's gene filter to `rna` with a minimum total count of 1; `rna`
# is modified by the call and then displayed.
sc.pp.filter_genes(rna, min_counts=1)
rna

In [None]:
# Collect barcodes flagged in the per-sample AMULET multiplet tables
# (rows with `p-value` < 0.05) and drop them from the ATAC data.
doublets = set()
amulets = [
    "GSM4572187_Control1_AMULET",
    "GSM4572188_Control2_AMULET",
    "GSM4572189_Control3_AMULET",
    "GSM4572190_Control4_AMULET",
    "GSM4572191_Control5_AMULET"
]
for i, amulet in enumerate(amulets):
    flagged = pd.read_table(
        f"../download/Muto-2021/{amulet}/MultipletProbabilities.txt"
    ).query("`p-value` < 0.05")
    # Rewrite only a trailing "-1" to the 1-based sample suffix. The pattern is
    # anchored and `regex` is explicit so the result does not depend on the
    # pandas version's default or on "-1" appearing elsewhere in a barcode.
    doublets.update(
        flagged["barcode"].str.replace(r"-1$", f"-{i+1}", regex=True)
    )
# Vectorised membership test instead of a per-cell Python loop.
retained_cells = atac.obs_names[~atac.obs_names.isin(doublets)]
atac = atac[retained_cells, :]
atac

In [None]:
# Build a graph between the ATAC peaks (named by their var names) and the
# hg38 blacklist intervals.
# NOTE(review): window_size=0 presumably restricts links to overlapping
# intervals — confirm against scglue.genomics.window_graph.
atac_peaks_bed = scglue.genomics.Bed(atac.var.assign(name=atac.var_names))
blacklist_overlap = scglue.genomics.window_graph(
    atac_peaks_bed,
    "../genome/Blacklist/lists/hg38-blacklist.v2.bed.gz",
    window_size=0,
)

In [None]:
# Row sums of the peak-by-blacklist biadjacency matrix give, per peak, the
# total weight of its links to blacklist intervals; keep peaks where it is 0.
blacklist_hits = biadjacency_matrix(blacklist_overlap, atac.var_names).sum(axis=1)
retained_peaks = np.asarray(blacklist_hits).ravel() == 0
atac = atac[:, retained_peaks]

# Store peak start / end coordinates as integers.
atac.var = atac.var.astype({column: int for column in ("chromStart", "chromEnd")})
atac

In [None]:
# Apply scanpy's gene filter to `atac` (whose variables are peaks) with a
# minimum total count of 1; `atac` is modified by the call and then displayed.
sc.pp.filter_genes(atac, min_counts=1)
atac

In [None]:
# Keep only peaks that have at least one count in every donor (batch).
ohe = sklearn.preprocessing.OneHotEncoder()
# One-hot cell-by-donor indicator built from the `batch` column.
donor_ohe = ohe.fit_transform(atac.obs[["batch"]])
# Donor-by-peak matrix of counts summed over each donor's cells.
atac_per_donor = donor_ohe.T @ atac.X
# Number of donors in which each peak is observed. `np.asarray(...).ravel()`
# is used instead of `.A1`, which exists only on `np.matrix` and therefore
# fails when the product is a dense ndarray or a sparse-array result.
atac_n_donor = np.asarray((atac_per_donor > 0).sum(axis=0)).ravel()
# A peak is retained when that number equals the total number of donors.
atac = atac[:, atac_n_donor == atac_per_donor.shape[0]].copy()
atac.shape

In [None]:
# Restrict `frags2rna` to the cells kept in `retained_cells` (the ATAC cells
# that were not flagged as doublets), then display it.
frags2rna = frags2rna[retained_cells, :]
frags2rna

In [None]:
# Genes present in `rna` but absent from `frags2rna`.
missing_vars = list(set(rna.var_names).difference(frags2rna.var_names))

# All-zero sparse block with the same cells and one column per missing gene.
zero_block = anndata.AnnData(
    X=scipy.sparse.csr_matrix((frags2rna.shape[0], len(missing_vars))),
    obs=pd.DataFrame(index=frags2rna.obs_names),
    var=pd.DataFrame(index=missing_vars),
)

# Append the zero columns, then select and order the columns exactly as in RNA.
frags2rna = anndata.concat([frags2rna, zero_block], axis=1, merge="first")
frags2rna = frags2rna[:, rna.var_names].copy()  # Keep the same features as RNA
frags2rna

# Process data

In [None]:
# Flag highly variable genes in `rna` (seurat_v3 flavor, 2000 genes requested,
# using `batch` as the batch key), then show how many genes were flagged.
sc.pp.highly_variable_genes(
    rna,
    n_top_genes=2000,
    flavor="seurat_v3",
    batch_key="batch",
)
rna.var["highly_variable"].sum()

# Save data

In [None]:
# Write the three processed objects as gzip-compressed h5ad files, in the
# order RNA, ATAC, FRAGS2RNA.
for adata, output_path in (
    (rna, "../dataset/Muto-2021-RNA.h5ad"),
    (atac, "../dataset/Muto-2021-ATAC.h5ad"),
    (frags2rna, "../dataset/Muto-2021-FRAGS2RNA.h5ad"),
):
    adata.write(output_path, compression="gzip")