In [None]:
import numpy as np
import pandas as pd
import scipy.io
import anndata
import scanpy as sc

import scglue

# Read data

In [None]:
# Raw digital gene expression (DGE) counts in Matrix Market format.
dge_path = "../download/Saunders-2018/F_GRCm38.81.P60Cortex_noRep5_FRONTALonly.raw.dge.mtx.gz"
# Transpose the matrix as stored on disk, then convert it to CSR.
X = (
    scipy.io.mmread(dge_path)
    .T
    .tocsr()
)

In [None]:
def _read_names(path):
    """Read a headerless, one-name-per-line text file into a 1-D array.

    ``dtype=str`` together with ``keep_default_na=False`` keeps every entry
    verbatim, so an identifier that happens to be spelled like one of pandas'
    default NA markers (e.g. "NA", "nan", "NULL") or like a number is not
    silently converted to NaN or a numeric value.
    """
    return pd.read_table(
        path, header=None, dtype=str, keep_default_na=False
    ).to_numpy().ravel()


# Column names of the DGE matrix file become the observation names.
obs_names = _read_names(
    "../download/Saunders-2018/F_GRCm38.81.P60Cortex_noRep5_FRONTALonly.raw.dge.colnames"
)
# Row names of the DGE matrix file become the variable names.
var_names = _read_names(
    "../download/Saunders-2018/F_GRCm38.81.P60Cortex_noRep5_FRONTALonly.raw.dge.rownames"
)

In [None]:
# Per-cell cluster outcome table, indexed by its first CSV column.
cluster_outcomes_path = (
    "../download/Saunders-2018/F_GRCm38.81.P60Cortex_noRep5_FRONTALonly.cell_cluster_outcomes.csv"
)
obs = pd.read_csv(cluster_outcomes_path, index_col=0)
# Reorder (and restrict) the rows so they follow obs_names.
obs = obs.loc[obs_names, :]

In [None]:
# Variable table with no columns yet, indexed by var_names.
gene_table = pd.DataFrame(index=var_names)
adata = anndata.AnnData(X=X, obs=obs, var=gene_table)
# Name both indices.
adata.obs.index.name = "cells"
adata.var.index.name = "genes"
adata

In [None]:
# Full annotation workbook.
annotation_table = pd.read_excel(
    "../download/Saunders-2018/annotation.BrainCellAtlas_Saunders_version_2018.04.01.xlsx",
    engine="openpyxl",
)
# Keep only rows whose tissue is 'FC' and whose class is 'NEURON'.
fc_neuron_annotation = annotation_table.query("tissue == 'FC' & `class` == 'NEURON'")
fc_neuron_annotation.head(n=2)

# Process metadata

In [None]:
# Subcluster identifiers that actually occur among the loaded cells.
subclusters = set(adata.obs["subcluster"])
# (subcluster, common_name) pairs taken row by row from the annotation table.
annotated_pairs = zip(
    fc_neuron_annotation["subcluster"],
    fc_neuron_annotation["common_name"],
)
# Map each annotated subcluster that occurs in the data to its common name.
subcluster_annotation_map = dict(
    (name, label) for name, label in annotated_pairs if name in subclusters
)

In [None]:
# Broad cell type for every subcluster whose identifier starts with the given
# cluster prefix; prefixes are tried in order and the first match wins.
_BROAD_TYPE_BY_PREFIX = (
    ("1-", "CGE"),
    ("2-", "MGE"),
    ("3-", "Layer6"),
    ("4-", "Layer5b"),
    ("5-", "Claustrum"),
    ("6-", "Layer5a"),
    ("7-", "Layer5"),
)
# Subclusters of cluster 6 that are labelled differently from the rest of it.
_BROAD_TYPE_EXCEPTIONS = {"6-1": "Layer2/3", "6-2": "Layer2/3"}


def _broad_cell_type(subcluster):
    """Return the broad cell type for a subcluster id, or NaN if it has none."""
    if subcluster in _BROAD_TYPE_EXCEPTIONS:
        return _BROAD_TYPE_EXCEPTIONS[subcluster]
    for prefix, broad_type in _BROAD_TYPE_BY_PREFIX:
        if subcluster.startswith(prefix):
            return broad_type
    return np.nan  # every other cluster, including 11-2


# Same keys (in the same order) as subcluster_annotation_map, with each value
# replaced by the broad cell type derived from the key.
broad_annotation_map = {
    subcluster: _broad_cell_type(subcluster)
    for subcluster in subcluster_annotation_map
}

In [None]:
# Look up both annotation levels for every cell; subclusters without an entry
# in the respective map get NaN.
subcluster_per_cell = adata.obs["subcluster"]
adata.obs["cell_type"] = [
    broad_annotation_map.get(subcluster, np.nan)
    for subcluster in subcluster_per_cell
]
adata.obs["cell_subtype"] = [
    subcluster_annotation_map.get(subcluster, np.nan)
    for subcluster in subcluster_per_cell
]
# Constant provenance columns, identical for every cell.
for column, value in (
    ("domain", "scRNA-seq"),
    ("protocol", "Drop-seq"),
    ("dataset", "Saunders-2018"),
):
    adata.obs[column] = value
adata.obs.head(n=2)

In [None]:
# Annotate the genes from the GTF, matching on the GTF's gene_name field.
# NOTE(review): presumably this is what adds the chrom / chromStart / chromEnd
# columns used by a later cell -- confirm against the scglue documentation.
gtf_path = "../genome/gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz"
scglue.data.get_gene_annotation(adata, gtf=gtf_path, gtf_by="gene_name")
# Record the genome assembly label on every gene.
adata.var["genome"] = "mm10"
adata.var.head(n=2)

# Clean data

In [None]:
# Keep only the cells that have both a cell_type and a cell_subtype label.
has_both_labels = adata.obs[["cell_type", "cell_subtype"]].notna().all(axis=1)
retained_cells = adata.obs.loc[has_both_labels].index
adata = adata[retained_cells, :]
adata

In [None]:
# Keep only the genes that have complete genomic coordinates.
retained_genes = adata.var.dropna(subset=["chrom", "chromStart", "chromEnd"]).index
# Subsetting an AnnData yields a view. Materialize it explicitly before
# assigning `.var`; otherwise the assignment below forces an implicit copy of
# the view and emits an ImplicitModificationWarning.
adata = adata[:, retained_genes].copy()
# With missing values removed, the coordinates can be stored as integers.
adata.var = adata.var.astype({"chromStart": int, "chromEnd": int})
adata

In [None]:
# Drop genes with fewer than one count in total, modifying adata in place.
sc.pp.filter_genes(adata, min_counts=1, inplace=True)
adata

# Process data

In [None]:
# Flag the top 2000 highly variable genes using the "seurat_v3" flavor.
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=2000)
# Number of genes flagged in the resulting `highly_variable` column.
adata.var["highly_variable"].sum()

# Save data

In [None]:
# Persist the processed dataset as a gzip-compressed h5ad file.
output_path = "../dataset/Saunders-2018.h5ad"
adata.write_h5ad(output_path, compression="gzip")