# This notebook must be run in the `scvi` conda environment inside the `scvi.sif` Singularity container

In [None]:
# scVI and scANVI Integration of Data
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scvi
import anndata as ad

The whole-brain atlas of the adolescent mouse was downloaded from: http://mousebrain.org/adolescent/downloads.html

In [None]:
# One-time conversion: read the atlas .loom file and save it as .h5ad,
# which is the format the following cells load.
base_path = '/hpc/projects/group.quake/doug/UCE/data/'
file_name = 'l5_all.loom'
output_name = 'l5_all.h5ad'

l5_brain = sc.read_loom(f'{base_path}{file_name}')

# Mirror the Taxonomy_group annotation into a 'celltype' column; the
# integration cells below read the labels from obs['celltype'].
l5_brain.obs['celltype'] = l5_brain.obs['Taxonomy_group']

l5_brain.write_h5ad(f'{base_path}{output_name}')

In [None]:
# Load the converted atlas and use the donor ID as its batch annotation.
whole = sc.read_h5ad(base_path+output_name)
whole.obs['batch'] = whole.obs.DonorID

# Preserve the raw counts for scVI (which is set up with layer="counts").
# Store a COPY: the normalization / log1p steps below update X in place, and
# an un-copied layer would share that same matrix and stop holding raw counts.
whole.layers['counts'] = whole.X.copy()

# Standard scanpy preprocessing on X: library-size normalization, log1p,
# and highly-variable-gene annotation.
sc.pp.normalize_total(whole)
sc.pp.log1p(whole)
sc.pp.highly_variable_genes(whole, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Neighborhood graph and UMAP of the atlas on its own.
sc.pp.neighbors(whole)
sc.tl.umap(whole)

# Load the Baysor-segmented dataset (labelled 'Vizgen' in later cells).
ad_viz = sc.read_h5ad('../Baysor/baysor_segmented_6-5_micron.h5ad')

In [None]:
# Restrict both datasets to the genes they have in common.
genes_Vizgen = ad_viz.var.index
genes_10x = whole.var.index
genes_shared = genes_Vizgen.intersection(genes_10x) # Index of shared genes

# Take real copies rather than views: both objects are modified right below,
# and writing to an AnnData view forces an implicit copy with a warning.
ad_viz = ad_viz[:, genes_Vizgen.isin(genes_shared)].copy()
whole = whole[:, genes_10x.isin(genes_shared)].copy()

# Tag each dataset with its technology of origin.
ad_viz.obs['tech']='Vizgen'
whole.obs['tech']='10x'

# De-duplicate cell and gene names in each dataset.
ad_viz.obs_names_make_unique()
whole.obs_names_make_unique()
ad_viz.var_names_make_unique()
whole.var_names_make_unique()

In [None]:
# Stack the two datasets; `label`/`keys` write the dataset of origin to obs['tech'].
brain_concat = ad.concat([whole, ad_viz], join='outer', label='tech', keys=['10x', 'Vizgen'])

# Use the annotations from the 10x, and treat the MERFISH as unlabeled
brain_concat.obs['celltype_scanvi'] = 'Unknown'
brain_10x_mask = brain_concat.obs['tech'] == '10x'

# Single .loc assignment instead of chained indexing (obs[col][mask] = ...),
# which may write to a temporary copy and is a no-op under pandas copy-on-write.
brain_concat.obs.loc[brain_10x_mask, 'celltype_scanvi'] = (
    brain_concat.obs.loc[brain_10x_mask, 'celltype'].to_numpy()
)


# Create the scVI latent space
scvi.model.SCVI.setup_anndata(brain_concat, layer="counts", batch_key="tech")
vae_brain = scvi.model.SCVI(brain_concat)

# Train the brain model
vae_brain.train()

In [None]:
# Register the object and run scANVI
# `unlabeled_category` must be the placeholder VALUE that marks unlabeled cells
# in `labels_key` ('Unknown', assigned when celltype_scanvi was created), not a
# column name.
scvi.model.SCANVI.setup_anndata(
    brain_concat,
    layer="counts",
    batch_key="tech",
    labels_key="celltype_scanvi",
    unlabeled_category="Unknown"
)


# Initialize scANVI from the trained scVI model, using the same label column
# and unlabeled placeholder as the registration above.
lvae_brain = scvi.model.SCANVI.from_scvi_model(
    vae_brain,
    unlabeled_category="Unknown",
    adata=brain_concat,
    labels_key="celltype_scanvi",
)

lvae_brain.train(max_epochs=20, n_samples_per_label=100)

In [None]:
# Store the scANVI outputs on the concatenated object:
#   obs["C_scANVI"]  - predicted label per cell
#   obsm["X_scANVI"] - latent representation per cell
brain_concat.obs["C_scANVI"] = lvae_brain.predict(brain_concat)
brain_concat.obsm["X_scANVI"] = lvae_brain.get_latent_representation(
    brain_concat
)

# Build the neighbor graph on the scANVI latent space, then embed with UMAP
sc.pp.neighbors(brain_concat, use_rep="X_scANVI")
sc.tl.umap(brain_concat)

# Joint UMAP, colored by technology and by predicted label
sc.pl.umap(
    brain_concat,
    color=['tech', 'C_scANVI'],
)

In [None]:
# Pull out the Vizgen cells as an independent object. `.copy()` is required
# because the subset is modified below; writing to an AnnData view would
# otherwise trigger an implicit copy with a warning.
test = brain_concat[brain_concat.obs.tech == 'Vizgen'].copy()

# Expose the per-cell x/y columns as spatial coordinates.
test.obsm['spatial'] = test.obs[["x", "y"]].to_numpy()

# Use the scANVI predictions as the cell-type annotation for these cells.
test.obs['celltype'] = test.obs.C_scANVI

In [None]:
# Recompute the neighbor graph, UMAP and Leiden clustering on the Vizgen
# subset alone (neighbors is called without use_rep here).
sc.pp.neighbors(test)
sc.tl.umap(test)
sc.tl.leiden(test, resolution=1)

# Plot the Vizgen-only UMAP, colored by the transferred cell-type labels
sc.pl.umap(
    test,
    color=['celltype'],
)

# Save the annotated Vizgen subset
test.write_h5ad('../Baysor/full_dataset_SCANVI.h5ad')