In [None]:
import os
import tempfile
import numpy as np
import muon as mu
import anndata as ad
import torch
from lightning.pytorch import Trainer
import matplotlib.pyplot as plt
import scanpy as sc

import scvi
from scvi.external import SysVI
from preprocessing import set_random_seed


# Project root: the current working directory, rewritten with forward slashes
# (so the string concatenations below also work on Windows) plus a trailing '/'.
path = "{}/".format(os.path.abspath('').replace('\\', '/'))
data_path = f"{path}dataset/"  # input .h5mu files; embeddings are written here too
save_path = f"{path}results/"  # trained models and prediction tables
# Print the installed scvi-tools version so the notebook output records what this run used.
print("Last run with scvi-tools version:", scvi.__version__)

## sysVI

In [None]:
# Cross-system integration with sysVI.
#
# For every dataset, the context and target modalities are restricted to the
# genes they share (matched on the 'human_gene_names' column), library-size
# normalised, log1p-transformed and stacked into a single AnnData. A SysVI model
# is then trained once per seed; each model and its latent embedding are saved.

# dataset name -> (context modality, target modality, .h5mu file stem).
# Both liver tasks read the same file but use a different target modality.
SYSVI_DATASETS = {
    "liver_human": ("mouse", "human", "liver"),
    "liver_Nafld": ("mouse", "mouseNafld", "liver"),
    "glio": ("mouse", "human", "glio"),
    "adipose": ("mouse", "human", "adipose"),
}
N_SEEDS = 10        # independent training runs per dataset
SEED_STRIDE = 1234  # run i uses seed i * SEED_STRIDE

for dataset, (context_key, target_key, load_key) in SYSVI_DATASETS.items():
    # Loading and preprocessing do not depend on the seed, so they are done
    # once per dataset instead of once per training run.
    mdata = mu.read_h5mu(data_path+load_key+".h5mu")

    # 'system' is the covariate passed to SysVI as batch_key: 0 = context, 1 = target.
    mdata[context_key].obs['system'] = 0
    mdata[target_key].obs['system'] = 1

    context_genes = np.array(mdata[context_key].var['human_gene_names'])
    target_genes = np.array(mdata[target_key].var['human_gene_names'])

    # ind_a / ind_b index the shared genes in the target / context modality,
    # both in the (sorted) order of the intersection, so columns line up.
    shared_genes, ind_a, ind_b = np.intersect1d(target_genes, context_genes, return_indices=True)

    # Explicit copies: the subsets are renamed and normalised in place below,
    # which should not rely on AnnData's implicit view-to-copy conversion.
    mdata_target = mdata[target_key][:, ind_a].copy()
    mdata_context = mdata[context_key][:, ind_b].copy()

    mdata_target.var_names = target_genes[ind_a]
    mdata_context.var_names = context_genes[ind_b]

    sc.pp.normalize_total(mdata_target, target_sum=1e4)
    sc.pp.normalize_total(mdata_context, target_sum=1e4)

    sc.pp.log1p(mdata_target)
    sc.pp.log1p(mdata_context)

    adata_base = ad.concat([mdata_context, mdata_target], axis=0, join='inner')

    for i in range(N_SEEDS):
        seed = i * SEED_STRIDE
        scvi.settings.seed = seed
        set_random_seed(seed)

        # Fresh copy per run so every model is set up on an untouched AnnData,
        # as it was when the file was reloaded for each seed.
        adata = adata_base.copy()

        SysVI.setup_anndata(
            adata=adata,
            batch_key="system",
            categorical_covariate_keys=["batch"],
        )

        model = SysVI(adata=adata, n_layers=2, n_hidden=256, dropout_rate=0.1, n_latent=10)

        model.train(
            max_epochs=30,
            log_every_n_steps=1,
            check_val_every_n_epoch=1,
            val_check_interval=1.0,
        )

        model.save(save_path + f"{dataset}_sysvi_model_{i}", overwrite=True)

        embed = model.get_latent_representation(adata=adata)
        embed = sc.AnnData(embed, obs=adata.obs)
        # Label cells with the modalities actually used. A fixed
        # {0: "mouse", 1: "human"} mapping would tag the 'mouseNafld' target of
        # liver_Nafld as human; the other datasets get the same labels as before.
        embed.obs["system"] = embed.obs["system"].map({0: context_key, 1: target_key})

        embed.write(data_path+dataset+'_embed_sysVI_'+str(i)+'.h5ad')

## CellTypist

In [None]:
#pip install celltypist

import celltypist
from celltypist import models

# Download CellTypist's models.
# NOTE(review): the loop below only trains and applies custom models, so this
# download may not be required — confirm before removing.
# (use models.download_models(force_update = True) to force an update)
models.download_models()

# dataset name -> (context modality, target modality, .h5mu file stem).
# Both liver tasks read the same file but use a different target modality.
CELLTYPIST_DATASETS = {
    "liver_human": ("mouse", "human", "liver"),
    "liver_Nafld": ("mouse", "mouseNafld", "liver"),
    "adipose": ("mouse", "human", "adipose"),
    "glio": ("mouse", "human", "glio"),
}

# The model and prediction files below are written into save_path; make sure
# it exists so this cell also works when run on its own.
os.makedirs(save_path, exist_ok=True)

for cell_type in ["cell_type_fine", "cell_type_coarse"]:
    for dataset, (context_key, target_key, load_key) in CELLTYPIST_DATASETS.items():
        # Read the file once and take both modalities from it. The data are
        # reloaded for every label granularity because they are normalised in
        # place below.
        celltypist_mdata = mu.read_h5mu(data_path+load_key+".h5mu")
        context = celltypist_mdata.mod[context_key]
        target = celltypist_mdata.mod[target_key]

        # Index the target by its 'mouse_gene_names' column.
        # NOTE(review): presumably so its features match the context's
        # var_names — confirm against the dataset.
        target.var_names = target.var['mouse_gene_names']

        # Normalise to 10,000 counts per cell and log1p-transform both sets.
        sc.pp.normalize_total(context, target_sum=1e4)
        sc.pp.log1p(context)

        sc.pp.normalize_total(target, target_sum=1e4)
        sc.pp.log1p(target)

        # Train a custom classifier on the context cells using the obs column
        # named by cell_type as labels, then store it.
        new_model = celltypist.train(context, labels=cell_type, n_jobs=10, feature_selection=True)
        new_model.write(save_path+dataset+'_'+cell_type+'_custom_model.pkl')

        # Annotate the target cells with the custom model and save the labels.
        predictions = celltypist.annotate(target, model = new_model, majority_voting = True)
        predictions.predicted_labels.to_csv(save_path+dataset+'_'+cell_type+'_predictions.csv')