In [None]:
import re
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

import scglue

# Read data

In [None]:
# Load the loom file into an AnnData object, keeping the matrix sparse and
# taking the observation / variable names from the like-named loom attributes.
adata = anndata.read_loom(
    "../download/Cao-2020/GSE156793_S3_gene_count.loom",
    sparse=True,
    obs_names="obs_names",
    var_names="var_names",
)

# Give both axes an explicit index name.
adata.obs.index.name = "cells"
adata.var.index.name = "genes"
adata

In [None]:
# Per-cell metadata table; its first column is used as the row index.
obs_meta = pd.read_csv(
    "../download/Cao-2020/GSE156793_S1_metadata_cells.txt.gz",
    index_col=0,
)
obs_meta.head()

# Process metadata

In [None]:
# Strip one or more trailing "-<digit>" groups from the cell names so they can
# be compared against the metadata index.
pattern = re.compile(r"(-[0-9])+$")

# Use the pandas string accessor instead of np.vectorize over a Python lambda:
# it also works on an empty index and returns a pandas Index, so the index name
# can be set in the same expression rather than restored afterwards.
adata.obs.index = adata.obs.index.str.replace(pattern, "", regex=True).rename("cells")

# Index.equals returns False on a length mismatch (an element-wise `==` would
# raise a ValueError instead), so any misalignment fails with this message.
assert adata.obs.index.equals(obs_meta.index), (
    "Cell names in adata.obs do not match the metadata index"
)

In [None]:
# Copy every metadata column that adata.obs does not have yet, in sorted
# order so the resulting column order is deterministic.
missing_cols = sorted(set(obs_meta.columns) - set(adata.obs.columns))
for col in missing_cols:
    adata.obs[col] = obs_meta[col]

In [None]:
# Constant, dataset-wide annotations (insertion order fixes the column order).
constant_obs = {
    "domain": "scRNA-seq",
    "protocol": "sci-RNA-seq3",
    "dataset": "Cao-2020",
}
for column, constant in constant_obs.items():
    adata.obs[column] = constant

# Expose the main cluster label under the "cell_type" column name.
adata.obs["cell_type"] = adata.obs["Main_cluster_name"]
adata.obs.head()

In [None]:
# Remove the existing "gene_type" column before annotating, to avoid duplicated
# columns (presumably the GTF annotation adds one of the same name -- confirm).
del adata.var["gene_type"]

# Annotate genes from the GENCODE v19 GTF, matching on "gene_id" on both sides.
scglue.data.get_gene_annotation(
    adata,
    var_by="gene_id",
    gtf="../genome/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz",
    gtf_by="gene_id",
)
adata.var["genome"] = "hg19"
adata.var.head()

# Clean data

In [None]:
# Every gene must have complete genomic coordinates before the cast below.
coord_cols = ["chrom", "chromStart", "chromEnd"]
assert not adata.var[coord_cols].isna().to_numpy().any()

# Store the start / end coordinates as integers.
adata.var = adata.var.astype({col: int for col in ("chromStart", "chromEnd")})
adata

In [None]:
# Keep only genes with at least one count; the return value is not used, so
# this relies on scanpy filtering adata in place (its default).
sc.pp.filter_genes(adata, min_counts=1)
adata

# Process data

In [None]:
# Flag the top 4000 highly variable genes. Per the scanpy documentation, the
# "seurat_v3" flavor expects count data; no normalization has been applied
# in the cells above.
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=4000)

# Number of genes flagged as highly variable.
adata.var["highly_variable"].sum()

# Save data

In [None]:
# Write the processed AnnData object to disk as a gzip-compressed h5ad file.
adata.write_h5ad("../dataset/Cao-2020.h5ad", compression="gzip")