# SATURN

Install the SATURN package from https://github.com/snap-stanford/SATURN and, following the instructions there, obtain the mouse-human macrogenes before running this notebook.
Then change the paths below so that they point to where the protein embeddings are saved:

In [None]:
# Per-species protein-embedding files (gene symbol -> embedding; ESM1b according
# to the file names). They are written to the `embedding_path` column of the run
# CSV that is handed to train-saturn.py further down. Replace the `path/` prefix
# with the directory that holds the embeddings locally.
mouse_embedding_path = "path/protein_embeddings/data/Mus_musculus.GRCm39.gene_symbol_to_embedding_ESM1b.pt"
human_embedding_path = "path/protein_embeddings/data/Homo_sapiens.GRCh38.gene_symbol_to_embedding_ESM1b.pt"

In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import muon as mu
import scanpy as sc
import subprocess
import os
from pathlib import Path

# IPython autoreload in mode 2: imported modules are reloaded before each cell
# is executed, so edits to local source files are picked up without a kernel
# restart.
%load_ext autoreload
%autoreload 2

# Working directory of the notebook, normalised to forward slashes and ending
# in "/", so sub-locations can be appended by plain string concatenation.
path = "/".join(os.path.abspath('').split('\\')) + '/'
# Inputs are read from <cwd>/dataset/, outputs belong in <cwd>/results/.
data_path = f"{path}dataset/"
save_path = f"{path}results/"

In [None]:
# Train SATURN on every mouse/human dataset pair, once per seed (10 seeds).
# Each iteration loads the paired MuData file, derives human "cell types" by
# Leiden clustering, writes the inputs train-saturn.py reads under data/, and
# launches the training script as a subprocess.
for i in range(10):
    for dataset in ["liver_human", "adipose", "glio"]:
        # Map the dataset name to the MuData modalities to use and the file stem to load.
        if dataset in ("glio", "adipose", "glio_full", "adipose_full"):
            context_key = 'mouse'
            target_key = 'human'
            load_key = dataset
        elif dataset in ("liver_human", "liver_human_full"):
            context_key = 'mouse'
            target_key = 'human'
            load_key = 'liver'
        else:
            # Fail loudly rather than silently reusing keys from an earlier iteration.
            raise ValueError(f"Unknown dataset: {dataset!r}")

        # NOTE(review): saturn_results/fz_centroids.pkl is shared by every run of
        # this loop. Code that used to sit here (commented out, and referring to an
        # undefined `dir_path`) tried to delete it before each run. Confirm whether
        # train-saturn.py reuses an existing centroid file and, if so, whether
        # reuse across datasets/seeds is intended.

        # Both output directories must exist before anything is written to them.
        metric_dir = Path("saturn_results")
        metric_dir.mkdir(parents=True, exist_ok=True)
        Path("data").mkdir(parents=True, exist_ok=True)

        mdata = mu.read_h5mu(data_path+load_key+".h5mu")
        adata_mouse = mdata.mod[context_key]
        adata_human = mdata.mod[target_key]

        # Cluster the human cells; the Leiden clusters stand in for human cell types.
        # NOTE(review): normalisation and log1p modify adata_human in place, so the
        # human file written below holds log-normalised values - confirm that this
        # is what train-saturn.py expects.
        sc.pp.normalize_total(adata_human, target_sum=1e4)
        sc.pp.log1p(adata_human)
        sc.pp.neighbors(adata_human, use_rep='X')
        sc.tl.leiden(adata_human)

        # Both species must expose their labels under the same column name.
        adata_mouse.obs.rename(columns={"cell_type_fine": "cell_type"}, inplace=True)
        adata_human.obs.rename(columns={"leiden": "cell_type"}, inplace=True)

        mouse_cells = np.array(adata_mouse.obs.cell_type.unique())
        human_cells = np.array(adata_human.obs.cell_type.unique())

        # Cell-type map rows: labels shared by both species first (a), then
        # mouse-only labels (b) paired with NaN, then human-only labels (d)
        # paired with NaN.
        a = np.intersect1d(mouse_cells, human_cells)
        b = np.setdiff1d(mouse_cells, human_cells)
        c = np.full_like(b, np.nan, dtype=float)
        d = np.setdiff1d(human_cells, mouse_cells)
        e = np.full_like(d, np.nan, dtype=float)

        mouse_cell_type = np.concatenate((np.concatenate((a, b)), e))
        human_cell_type = np.concatenate((np.concatenate((a, c)), d))
        index = np.arange(len(mouse_cell_type))

        mouse_human_cell_type_map = pd.DataFrame({
            'Unnamed: 0': index,
            'mouse_cell_type': mouse_cell_type,
            'human_cell_type': human_cell_type,
        })
        mouse_human_cell_type_map.to_csv("data/mouse_human_cell_type_map.csv")

        # Per-species AnnData files consumed by train-saturn.py.
        mouse_h5ad = "data/"+context_key+'_'+dataset+".h5ad"
        human_h5ad = "data/"+target_key+'_'+dataset+".h5ad"
        adata_mouse.write(mouse_h5ad)
        adata_human.write(human_h5ad)

        # Run description: one row per species with its data and embedding file.
        df = pd.DataFrame(columns=["path", "species", "embedding_path"])
        df["species"] = ["mouse", "human"]
        df["path"] = [mouse_h5ad, human_h5ad]
        df["embedding_path"] = [mouse_embedding_path, human_embedding_path]
        df.to_csv("data/mouse_human_run.csv", index=False)

        # The seed varies with the outer loop; the GPU index is hard-coded.
        command = [
            "python3", "../../train-saturn.py",
            "--in_data=data/mouse_human_run.csv",
            "--in_label_col=cell_type", "--ref_label_col=cell_type",
            "--num_macrogenes=2000", "--hv_genes=4000",
            "--centroids_init_path=saturn_results/fz_centroids.pkl",
            "--ct_map_path=data/mouse_human_cell_type_map.csv",
            "--work_dir=.", f"--seed={i*1234}", "--device_num=7",
        ]
        result = subprocess.run(command)
        if result.returncode != 0:
            # Keep the sweep going, but do not let a failed run pass unnoticed.
            print(f"SATURN training failed for dataset={dataset!r}, "
                  f"seed={i*1234} (exit code {result.returncode})")

In [None]:
# Train and score SATURN on the "liver_Nafld" dataset, once per seed (10 seeds).
# Each iteration loads the per-species AnnData files, writes the inputs
# train-saturn.py reads under data/, and launches the training script with
# --score_adata as a subprocess.
for i in range(10):
    for dataset in ["liver_Nafld"]:
        # Map the dataset name to the species keys and the file stem to load.
        if dataset in ("glio", "adipose"):
            context_key = 'mouse'
            target_key = 'human'
            load_key = dataset
        elif dataset in ("liver_human", "liver_human_full"):
            context_key = 'mouse'
            target_key = 'human'
            load_key = 'liver'
        elif dataset == "liver_Nafld":
            # Previously this dataset matched no branch, so the three keys were
            # whatever an earlier cell had left behind (or a NameError in a
            # fresh kernel).
            # NOTE(review): assumes the inputs are named
            # <dataset>_<species>.h5ad - confirm against the files in dataset/.
            context_key = 'mouse'
            target_key = 'human'
            load_key = dataset
        else:
            raise ValueError(f"Unknown dataset: {dataset!r}")

        # NOTE(review): saturn_results/fz_centroids.pkl is shared by every run of
        # this loop. Code that used to sit here (commented out, and referring to an
        # undefined `dir_path`) tried to delete it before each run. Confirm whether
        # train-saturn.py reuses an existing centroid file and, if so, whether
        # that reuse is intended.

        # Both output directories must exist before anything is written to them.
        metric_dir = Path("saturn_results")
        metric_dir.mkdir(parents=True, exist_ok=True)
        Path("data").mkdir(parents=True, exist_ok=True)

        # The mouse data is read via context_key; the original used target_key
        # for both reads and therefore loaded the same file twice.
        adata_mouse = ad.read_h5ad(data_path+load_key+'_'+context_key+".h5ad")
        adata_human = ad.read_h5ad(data_path+load_key+'_'+target_key+".h5ad")

        # Both species must expose their labels under the same column name.
        adata_mouse.obs.rename(columns={"cell_type_fine": "cell_type"}, inplace=True)
        adata_human.obs.rename(columns={"cell_type_fine": "cell_type"}, inplace=True)

        mouse_cells = np.array(adata_mouse.obs.cell_type.unique())
        human_cells = np.array(adata_human.obs.cell_type.unique())

        # Cell-type map rows: labels shared by both species first (a), then
        # mouse-only labels (b) paired with NaN, then human-only labels (d)
        # paired with NaN.
        a = np.intersect1d(mouse_cells, human_cells)
        b = np.setdiff1d(mouse_cells, human_cells)
        c = np.full_like(b, np.nan, dtype=float)
        d = np.setdiff1d(human_cells, mouse_cells)
        e = np.full_like(d, np.nan, dtype=float)

        mouse_cell_type = np.concatenate((np.concatenate((a, b)), e))
        human_cell_type = np.concatenate((np.concatenate((a, c)), d))
        index = np.arange(len(mouse_cell_type))

        mouse_human_cell_type_map = pd.DataFrame({
            'Unnamed: 0': index,
            'mouse_cell_type': mouse_cell_type,
            'human_cell_type': human_cell_type,
        })
        mouse_human_cell_type_map.to_csv("data/mouse_human_cell_type_map.csv")

        # Per-species AnnData files consumed by train-saturn.py.
        mouse_h5ad = "data/"+context_key+'_'+dataset+".h5ad"
        human_h5ad = "data/"+target_key+'_'+dataset+".h5ad"
        adata_mouse.write(mouse_h5ad)
        adata_human.write(human_h5ad)

        # Run description: one row per species with its data and embedding file.
        df = pd.DataFrame(columns=["path", "species", "embedding_path"])
        df["species"] = ["mouse", "human"]
        df["path"] = [mouse_h5ad, human_h5ad]
        df["embedding_path"] = [mouse_embedding_path, human_embedding_path]
        df.to_csv("data/mouse_human_run.csv", index=False)

        # The seed varies with the outer loop; the GPU index is hard-coded.
        command = [
            "python3", "../../train-saturn.py",
            "--in_data=data/mouse_human_run.csv",
            "--in_label_col=cell_type", "--ref_label_col=cell_type",
            "--num_macrogenes=2000", "--hv_genes=4000",
            "--centroids_init_path=saturn_results/fz_centroids.pkl",
            "--score_adata", "--ct_map_path=data/mouse_human_cell_type_map.csv",
            "--work_dir=.", f"--seed={i*1234}", "--device_num=7",
        ]
        result = subprocess.run(command)
        if result.returncode != 0:
            # Keep the sweep going, but do not let a failed run pass unnoticed.
            print(f"SATURN training failed for dataset={dataset!r}, "
                  f"seed={i*1234} (exit code {result.returncode})")