In [None]:
import numpy as np
import pandas as pd
import scipy.sparse
import anndata

import scglue

# Read data

## mCH

In [None]:
# Long-format methylation table; rows containing any missing value are dropped.
melted_df = pd.read_table("../download/Luo-2017/data/gene_level_mouse.txt.gz").dropna()

# Cell (row) and gene (column) indices, in order of first appearance in the table.
# Building them from `set(...)` would tie the order to string hashing, which
# differs between interpreter runs unless PYTHONHASHSEED is pinned, so the
# row/column order of everything derived from these indices would not be
# reproducible from one run to the next.
obs_names = pd.Index(melted_df["samp"].unique())
var_names = pd.Index(melted_df["geneID"].unique())

In [None]:
# Translate every long-format record into integer (row, column) coordinates.
ridx = obs_names.get_indexer(melted_df["samp"])
cidx = var_names.get_indexer(melted_df["geneID"])

# Scatter the raw mCH values into a matrix, then densify as float32.
mCH = scipy.sparse.csr_matrix((melted_df["mCH"], (ridx, cidx)))
mCH = mCH.astype(np.float32).toarray()

# Same coordinates, normalised values.
mCH_norm = scipy.sparse.csr_matrix((melted_df["mCH_norm"], (ridx, cidx)))
mCH_norm = mCH_norm.astype(np.float32).toarray()

## mCG

In [None]:
# Read the gene-body mCG table, using its first column as the row index and
# forcing the "chr" column to be parsed as text.
mCG = pd.read_table(
    "../download/Luo-2017/data/mCG_genebody_mouse.txt.gz",
    index_col=0,
    dtype={"chr": str},
)
# Discard the genomic annotation columns; the cells below only read the
# "<cell>_mc" and "<cell>_c" columns.
mCG = mCG.drop(columns=["name", "chr", "start", "end", "strand"])

In [None]:
# For each suffix, select the per-cell columns with genes ordered as in
# var_names, then transpose so rows follow obs_names and columns follow var_names.
mCG_mc, mCG_c = (
    mCG.loc[var_names, obs_names + suffix].to_numpy().T.astype(np.float32)
    for suffix in ("_mc", "_c")
)

# Element-wise "_mc" / "_c" ratio; NaN entries (e.g. 0 / 0) are replaced by 0.
# NOTE: this rebinds `mCG` from the DataFrame loaded above to a NumPy array.
mCG = mCG_mc / mCG_c
mCG[np.isnan(mCG)] = 0.0

In [None]:
# Per-row global level: total "_mc" over total "_c", summed across columns and
# kept as a column vector (keepdims) so it broadcasts row-wise below.
mCG_global = (
    np.sum(mCG_mc, axis=1, keepdims=True)
    / np.sum(mCG_c, axis=1, keepdims=True)
)
# Express every per-gene ratio relative to its row's global level.
mCG_norm = mCG / mCG_global

## Meta data

In [None]:
# Keep only the cells whose name begins with "Pool_" or "nuclei".
used_obs_names = [
    name for name in obs_names
    if name.startswith(("Pool_", "nuclei"))
]
used_ridx = obs_names.get_indexer(used_obs_names)

# Apply the same row selection to all four matrices.
# NOTE: the matrices are overwritten in place, so re-running this cell without
# re-running the cells above would subset them a second time.
mCH, mCH_norm, mCG, mCG_norm = (
    matrix[used_ridx, :]
    for matrix in (mCH, mCH_norm, mCG, mCG_norm)
)

In [None]:
# Place the mCH block and the mCG block side by side along the column axis:
# mCH columns first, mCG columns second.
X = np.concatenate((mCH, mCG), axis=1)
# Same layout for the normalised values.
X_norm = np.concatenate((mCH_norm, mCG_norm), axis=1)

In [None]:
# Load the metadata table (first column as index) and keep only the retained
# cells, in the same order as the rows of X.
obs = pd.read_csv(
    "../download/Luo-2017/gene_level_mouse_meta.csv",
    index_col=0,
).loc[used_obs_names, :]
obs.head()

In [None]:
# One (geneID, geneName) row per gene, ordered like var_names and then
# re-indexed by gene name (geneID is kept as a regular column).
var = (
    melted_df[["geneID", "geneName"]]
    .drop_duplicates()
    .set_index("geneID")
    .loc[var_names.tolist(), :]
    .reset_index()
    .set_index("geneName")
)
# Make the gene-name index unique.
var = var.set_index(anndata.utils.make_index_unique(var.index))
# Stack two suffixed copies of the table, "_mCH" first and "_mCG" second,
# matching the column layout of X.
var = pd.concat([
    var.set_index(var.index + suffix)
    for suffix in ("_mCH", "_mCG")
])
var.head()

In [None]:
# Assemble the annotated matrix: X as the main matrix, X_norm stored as the
# "norm" layer.
adata = anndata.AnnData(
    X=X,
    obs=obs,
    var=var,
    layers={"norm": X_norm},
)
# Name the two axes.
adata.obs.index.name = "cells"
adata.var.index.name = "genes"
adata

# Process meta

In [None]:
# Constant metadata columns, identical for every row of adata.obs.
for column, value in (
    ("domain", "snmC-seq"),
    ("protocol", "snmC-seq"),
    ("dataset", "Luo-2017"),
):
    adata.obs[column] = value

In [None]:
# Make sure variable names are unique before annotating.
adata.var_names_make_unique()

# Annotate adata.var from the GENCODE vM10 GTF, matching the "geneID" column
# against the GTF "gene_id" field, with `ens_trim_version` passed as the
# matching function.
# NOTE(review): presumably this is what adds the chrom / chromStart / chromEnd
# columns consumed by the cleaning step below; confirm against the scglue docs.
scglue.data.get_gene_annotation(
    adata,
    var_by="geneID",
    gtf="../genome/gencode.vM10.chr_patch_hapl_scaff.annotation.gtf.gz",
    gtf_by="gene_id",
    by_func=scglue.genomics.ens_trim_version,
)

# Record the genome label for every variable.
adata.var["genome"] = "mm10"
adata.var.head()

# Clean data

In [None]:
# Keep only the variables that have a complete genomic location.
retained_genes = adata.var.dropna(subset=["chrom", "chromStart", "chromEnd"]).index

# Subsetting an AnnData yields a view. Copy it explicitly before assigning to
# `.var` below, instead of relying on anndata's implicit view-to-actual
# conversion (which emits a warning). This also matches the `.copy()` used in
# the following cell.
adata = adata[:, retained_genes].copy()

# With the missing rows gone, the coordinates can be stored as integers.
adata.var = adata.var.astype({"chromStart": int, "chromEnd": int})
adata

In [None]:
# Flag the variables whose column sum over all cells is non-zero ...
has_signal = adata.X.sum(axis=0) != 0
retained_genes = adata.var_names[has_signal]

# ... and keep only those, materialising the subset as an independent object.
adata = adata[:, retained_genes].copy()
adata

# Save data

In [None]:
# Persist the cleaned dataset as a gzip-compressed .h5ad file.
adata.write_h5ad(
    "../dataset/Luo-2017.h5ad",
    compression="gzip",
)