In [None]:
%load_ext pretty_jupyter

In [None]:
import sys
import scvi
import cellxgene_census
import numpy as np
import pandas as pd
import scanpy as sc
import tomlkit
import logging
from os import path, system
from matplotlib import rc_context
import matplotlib.pyplot as plt

logging.basicConfig(level=logging.ERROR)


In [None]:
# scGPT resources handed to utils/scgpt_annotate.py further down (-m model dir, -n index dir)
# and the column name (-c) that is later used to colour the UMAP.
# Paths are relative to the notebook's working directory.
scGPT_model = "../resources/scGPT_CP/"
scGPT_index = "../resources/CellXGene_faiss_index/"
scGPT_column = "scGPT"

In [None]:
## Pipeline parameters
# TOML documents are UTF-8 by specification, so do not rely on the platform's
# default locale encoding when reading the file.
with open("../config.toml", "r", encoding="utf-8") as f:
    config = tomlkit.parse(f.read())

In [None]:
# Values pulled out of the parsed pipeline configuration.
ROOT_DIR = config["basic"]["ANALYSIS_DIR"]
DIR_SAVE = path.join(ROOT_DIR, config["basic"]["DIR_SAVE"])
TISSUE = config["basic"]["TISSUE"]
ANNOTATION_METHOD = config["annotation"]["ANNOTATION_METHOD"]
COUNTS_LAYER = config["normalization"]["COUNTS_LAYER"]
NORMALIZATION_LAYER = config["normalization"]["NORMALIZATION_METHOD"]
# The original (misspelled) name is kept as an alias so existing references keep working.
NORMAMALIZATION_LAYER = NORMALIZATION_LAYER

In [None]:
# Load adata.h5ad from the configured save directory.
adata = sc.read_h5ad(path.join(DIR_SAVE, "adata.h5ad"))

# scGPT

In [None]:
# Run the scGPT annotation in a separate process to avoid blocking the GPU resources in interactive mode
if ANNOTATION_METHOD == "scGPT":
    import subprocess

    # The script reads and overwrites the same file.
    adata_path = path.join(DIR_SAVE, "adata.h5ad")

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; check=True raises CalledProcessError when the
    # script fails instead of silently re-reading the un-annotated file.
    subprocess.run(
        [
            "python",
            "utils/scgpt_annotate.py",
            "-i", adata_path,
            "-c", scGPT_column,
            "-m", scGPT_model,
            "-n", scGPT_index,
            "-o", adata_path,
        ],
        check=True,
    )

    adata = sc.read_h5ad(adata_path)
    with rc_context({"figure.figsize": (8, 8)}):
        sc.pl.umap(adata, color=[scGPT_column])
        plt.tight_layout()


# scVI + cellxgene

In [None]:
import cellxgene_census
import cellxgene_census.experimental


In [None]:
# Census release and organism used for the embedding-metadata lookup below.
census_version = "2023-12-15"
organism = "homo_sapiens"

# NOTE(review): this handle is not passed to the call below, is never closed here, and
# `census` is re-opened in a later cell — confirm whether this open is still needed.
census = cellxgene_census.open_soma(census_version=census_version)

# Metadata for the embedding named "scvi" for this organism / Census release.
scvi_info = cellxgene_census.experimental.get_embedding_metadata_by_name(
    embedding_name="scvi",
    organism=organism,
    census_version=census_version,
)


In [None]:
import warnings

warnings.filterwarnings("ignore")

import anndata
import cellxgene_census
import numpy as np
import scanpy as sc
import scvi
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Download the 10x Genomics PBMC 3k filtered matrices archive into ./data and unpack it there.
!mkdir -p data
!wget -nv -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz
!tar -xzf data/pbmc3k_filtered_gene_bc_matrices.tar.gz -C data/


In [None]:
# Read the unpacked 10x matrix, using gene ids as var_names.
# NOTE(review): this rebinds `adata`, replacing the object loaded from DIR_SAVE earlier in the notebook.
adata = sc.read_10x_mtx("data/filtered_gene_bc_matrices/hg19/", var_names="gene_ids")


In [None]:
# Some PBMC data from these collections
# 1. https://cellxgene.cziscience.com/collections/c697eaaf-a3be-4251-b036-5f9052179e70
# 2. https://cellxgene.cziscience.com/collections/f2a488bf-782f-4c20-a8e5-cb34d48c1f7e
dataset_ids = [
    "fa8605cf-f27e-44af-ac2a-476bee4410d3",
    "3c75a463-6a87-4132-83a8-c3002624394d",
]

# Open the Census in a context manager so the handle is closed as soon as the
# requested slice (including its "scvi" obs embedding) has been read into memory.
with cellxgene_census.open_soma(census_version="2023-12-15") as census:
    adata_census = cellxgene_census.get_anndata(
        census=census,
        measurement_name="RNA",
        organism="Homo sapiens",
        obs_value_filter=f"dataset_id in {dataset_ids}",
        obs_embeddings=["scvi"],
    )

# Index the reference genes by their feature_id.
adata_census.var.set_index("feature_id", inplace=True)


In [None]:
# Display the "scvi" embedding fetched together with the reference cells.
adata_census.obsm["scvi"]

In [None]:
# Tag every query cell so it can be told apart from the reference datasets after concatenation.
adata.obs["dataset_id"] = "QUERY"

# Subset the reference dataset to have a similar number of cells to the query dataset.
# A seeded generator makes the subset reproducible across kernel restarts, and the size is
# capped at the number of available reference cells (sampling without replacement would
# otherwise raise when fewer than 3000 cells are present).
subset_rng = np.random.default_rng(0)
n_reference_cells = min(3000, adata_census.n_obs)
index_subset = subset_rng.choice(adata_census.n_obs, size=n_reference_cells, replace=False)
adata_census_subset = adata_census[index_subset, :]


In [None]:

# The neighbour graph below is built on obsm["scvi"], so both inputs must carry it.
# Fail early with an actionable message when the query embedding has not been computed
# yet (in this notebook it is assigned in a later cell).
if "scvi" not in adata.obsm:
    raise KeyError(
        'adata.obsm["scvi"] is missing; compute the query scVI embedding before running this cell'
    )

# Joint embedding of the reference subset and the query cells, coloured by origin.
adata_combined = anndata.concat([adata_census_subset, adata])
sc.pp.neighbors(adata_combined, n_neighbors=15, use_rep="scvi", metric="correlation")
sc.tl.umap(adata_combined)
sc.pl.umap(adata_combined, color=["dataset_id"])


In [None]:
# Train a random forest on the reference scVI embedding -> cell type labels.
# A fixed random_state makes the predictions reproducible across runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(adata_census.obsm["scvi"], adata_census.obs["cell_type"].values)

# Evaluate the forest once: the predicted label is the class with the highest probability
# (which is how the classifier's own predict() picks it), so labels and confidences are
# derived from the same probability matrix.
probabilities = rfc.predict_proba(adata.obsm["scvi"])
predicted_class_index = probabilities.argmax(axis=1)
adata.obs["predicted_cell_type"] = rfc.classes_[predicted_class_index]

# let's get confidence scores: the probability assigned to each cell's predicted class.
# Vectorised lookup avoids positional indexing into a label-indexed Series and the
# array-to-scalar assignment of the original per-cell loop.
confidence = probabilities[np.arange(adata.n_obs), predicted_class_index]


In [None]:

# NOTE(review): `vae_q` is not defined in any cell visible in this notebook — this cell
# fails on a fresh kernel unless the query model is created elsewhere. TODO confirm.
vae_q.is_trained = True
# Store the model's latent representation as the query embedding.
latent = vae_q.get_latent_representation()
adata.obsm["scvi"] = latent

# filter out missing features
# NOTE(review): assumes adata.var has a "gene_names" column — verify against how adata was loaded.
adata = adata[:, adata.var["gene_names"].notnull().values].copy()
adata.var.set_index("gene_names", inplace=True)

In [None]:
# -.-|m { input: false, output: false, input_fold: show}

import tomlkit
import scanpy as sc
from anndata import AnnData
import pandas as pd
import numpy as np
import scvi
import seaborn as sns
from pandas import DataFrame

from os import path
import session_info
import logging
from tempfile import TemporaryDirectory 
from os import system

# Set the root logging level to ERROR (the same call also appears in the first import cell).
logging.basicConfig(level=logging.ERROR)

# Fix the scvi-tools seed and set scanpy's default figure parameters.
scvi.settings.seed = 0
sc.set_figure_params(figsize=(6, 6), frameon=False)


In [None]:
# Model names passed to cell_typist_annotate(); empty here, so no CellTypist model is applied.
CELL_TYPIST_MODELS: list[str] = []

In [None]:
def scvi_cellassign_annotate(adata: AnnData, marker_matrix: DataFrame, subsample: int = 2000):
    """Build a CellAssign model on a random subsample of ``adata``.

    Args:
        adata: Annotated data whose ``layers[COUNTS_LAYER]`` holds the counts.
        marker_matrix: Binary gene x cell-type marker matrix indexed by gene name.
        subsample: Number of cells to draw; capped at ``adata.n_obs``.

    Returns:
        ``(cellassign_model, adata_annotate)`` — the untrained model and the
        subsampled, marker-restricted AnnData it was set up on.

    Raises:
        ValueError: If ``adata`` and ``marker_matrix`` share no genes.
    """
    import torch
    from scvi.external import CellAssign

    torch.set_float32_matmul_precision("high")

    # Keep adata's own gene order: a set-based intersection has no stable order,
    # which would make the model input differ from run to run.
    marker_genes = set(marker_matrix.index)
    markers_common = [gene for gene in adata.var_names if gene in marker_genes]
    if not markers_common:
        raise ValueError("adata.var_names and marker_matrix.index have no genes in common")

    # Sampling more cells than exist would raise, so cap the request.
    n_cells = min(subsample, adata.n_obs)
    adata_subsampled = sc.pp.subsample(adata, n_obs=n_cells, copy=True)

    # Size factors are scaled library sizes and must be computed on the full gene
    # set, before restricting to marker genes. asarray/ravel flattens the (n, 1)
    # matrix that sparse layers return from sum().
    lib_size = np.asarray(adata_subsampled.layers[COUNTS_LAYER].sum(axis=1)).ravel()

    adata_annotate = adata_subsampled[:, markers_common].copy()
    adata_annotate.X = adata_annotate.layers[COUNTS_LAYER]
    adata_annotate.obs["size_factor"] = lib_size / np.mean(lib_size)

    CellAssign.setup_anndata(adata_annotate, size_factor_key="size_factor")
    # Hand over only the rows that match the genes kept in adata_annotate.
    cellassign_model = CellAssign(adata_annotate, marker_matrix.loc[markers_common])

    return cellassign_model, adata_annotate


def convert_scHCA(
    tissue: str,
    quantile: float = 0.98,
    db_path: str = "../resources/scHCL_DB.csv",
) -> pd.DataFrame:
    """Derive a binary marker matrix for one tissue from the scHCL reference table.

    Columns whose name matches ``tissue`` (regular-expression search) are kept.
    Within each kept column, a gene is flagged 1 when its value is strictly above
    that column's ``quantile`` and 0 otherwise. Genes flagged in none or in all of
    the kept columns are dropped.

    Args:
        tissue: Regular expression matched against the column names.
        quantile: Per-column quantile used as the threshold.
        db_path: CSV file with genes as the first (index) column.

    Returns:
        Integer DataFrame (genes x matching columns) of 0/1 marker flags.

    Raises:
        ValueError: If no column matches ``tissue``.
    """
    reference = pd.read_csv(db_path, index_col=0)
    tissue_df = reference.filter(regex=tissue)
    if tissue_df.shape[1] == 0:
        # Without this check every gene is dropped and an empty matrix is
        # returned, which only fails much later and far from the cause.
        raise ValueError(f"No column in {db_path!r} matches tissue pattern {tissue!r}")

    thresholds = tissue_df.quantile(q=quantile, axis=0)
    binary_markers = tissue_df.gt(thresholds, axis=1).astype("int")

    # Remove markers where it is all 0 or 1
    never_marked = (binary_markers == 0).all(axis=1)
    always_marked = (binary_markers == 1).all(axis=1)
    return binary_markers.loc[~(never_marked | always_marked)]


def cell_typist_annotate(adata: AnnData, models: list[str], inplace: bool = True):
    """Annotate cells with one or more CellTypist models.

    For every model two columns are written to ``obs``:
    ``celltypist_<model>_label`` (majority-voting label) and
    ``celltypist_<model>_conf_score``.

    Args:
        adata: Annotated data whose ``layers[COUNTS_LAYER]`` holds the counts.
        models: CellTypist model names to download and apply.
        inplace: When True, write the columns into ``adata`` and return None.
            When False, leave ``adata`` untouched and return an annotated copy.

    Returns:
        None if ``inplace`` is True, otherwise the annotated copy.
    """
    import celltypist
    from celltypist import models as ctypist_models

    # force_update=True re-downloads the requested models on every call.
    ctypist_models.download_models(force_update=True, model=models)

    # Honour `inplace`: only touch the caller's object when asked to.
    target = adata if inplace else adata.copy()

    # Work on a copy, and copy the counts as well: normalisation runs in place on X
    # and must not be able to alter the caller's counts layer.
    adata_celltypist = adata.copy()
    adata_celltypist.X = adata.layers[COUNTS_LAYER].copy()
    # normalize_total replaces the deprecated normalize_per_cell and, unlike it,
    # does not drop cells, so the .loc lookups below always find every cell.
    sc.pp.normalize_total(adata_celltypist, target_sum=1e4)
    sc.pp.log1p(adata_celltypist)
    # Densify only when X is sparse; a dense array has no .toarray().
    if hasattr(adata_celltypist.X, "toarray"):
        adata_celltypist.X = adata_celltypist.X.toarray()

    for model in models:
        loaded_model = ctypist_models.Model.load(model=model)
        predictions = celltypist.annotate(adata_celltypist, model=loaded_model, majority_voting=True)
        predictions_obs = predictions.to_adata().obs
        target.obs["celltypist_" + model + "_label"] = predictions_obs.loc[target.obs.index, "majority_voting"]
        target.obs["celltypist_" + model + "_conf_score"] = predictions_obs.loc[target.obs.index, "conf_score"]

    if not inplace:
        return target


def annotate_scGPT(adata: AnnData):
    """Placeholder for an scGPT annotation step; currently does nothing and returns None."""
    return None



In [None]:
## Pipeline parameters
# TOML documents are UTF-8 by specification, so do not rely on the platform's
# default locale encoding when reading the file.
with open("../config.toml", "r", encoding="utf-8") as f:
    config = tomlkit.parse(f.read())

In [None]:
# Values pulled out of the parsed pipeline configuration.
ROOT_DIR = config["basic"]["ANALYSIS_DIR"]
DIR_SAVE = path.join(ROOT_DIR, config["basic"]["DIR_SAVE"])
TISSUE = config["basic"]["TISSUE"]
ANNOTATION_METHOD = config["annotation"]["ANNOTATION_METHOD"]
COUNTS_LAYER = config["normalization"]["COUNTS_LAYER"]
NORMALIZATION_LAYER = config["normalization"]["NORMALIZATION_METHOD"]
# The original (misspelled) name is kept as an alias so existing references keep working.
NORMAMALIZATION_LAYER = NORMALIZATION_LAYER

In [None]:
# NOTE(review): the config-driven input (DIR_SAVE/adata.h5ad, kept commented out below) is
# bypassed in favour of a fixed file under ../save — confirm which input is intended.
#adata = sc.read_h5ad(path.join(DIR_SAVE, "adata.h5ad"))
adata = sc.read_h5ad("../save/marcelo_ref.h5ad")

In [None]:
# Run the annotation backend selected in the configuration. The two methods are
# mutually exclusive, hence if/elif.
if ANNOTATION_METHOD == "scvi_cellassign":
    from IPython.display import display

    # Binary marker matrix for the configured tissue, then a CellAssign model on a subsample.
    markers = convert_scHCA(TISSUE)
    model, adata_annotated = scvi_cellassign_annotate(adata, markers, subsample=100)
    model.train(batch_size=8192, max_epochs=400)

    model.history["elbo_validation"].plot()

    predictions = model.predict()
    # A bare expression inside an `if` block is not rendered by the notebook;
    # display() shows the preview explicitly.
    display(predictions.head())

    sns.clustermap(predictions, cmap="viridis")
    # Per cell, the column of `predictions` with the highest value.
    adata_annotated.obs["cellassign_predictions"] = predictions.idxmax(axis=1).values

elif ANNOTATION_METHOD == "celltypist":
    cell_typist_annotate(adata, CELL_TYPIST_MODELS)